/******************************************************************************************
Syntax file to accompany the book Complex Survey Data Analysis with SAS(R) by Taylor Lewis.

last updated: 1.2.17
******************************************************************************************/


******* Read in SAS Data Sets ************************************************************;
* NOTE: assign the macro variable below the path to which you have saved the ZIP file
        of SAS data sets associated with this book;
%let path=C:\Data\; * <- just an example of a path;
libname data "&path";

*** Chapter 1;
data expenditures;
  set data.expenditures;
run;
data CLT_example_toplot;
  set data.CLT_example_toplot;
run;
data sample_means;
  set data.sample_means;
run;
data sample_SRSWOR;
  set data.sample_SRSWOR;
run;
data sample_str_SRSWOR;
  set data.sample_str_SRSWOR;
run;
data sample_clus;
  set data.sample_clus;
run;
data sample_weights;
  set data.sample_weights;
run;


*** Chapter 2;
data frame;
  set data.frame;
run;


*** Chapter 3/9;
data CBECS_2003;
  set data.CBECS_2003;
run;


*** Chapter 4/7;
data NSFG_0610_F;
  set data.NSFG_0610_F;
run;


*** Chapter 5/6/9;
data NAMCS_2010;
  set data.NAMCS_2010;
run;


*** Chapter 10/11;
data pop;
  set data.pop;
run;
data samp;
  set data.samp;
run;


****** Chapter 1: Features and Examples of Complex Surveys;

*** Figure 1.2;
proc format;
  value $y_type '1'='Variable 1: Normal Distribution'
                '2'='Variable 2: Right-Skewed'
				'3'='Variable 3: Bimodal Distribution';
run;
proc sgpanel data=CLT_example_toplot noautolegend;
  panelby y_type / novarname columns=1 rows=3 spacing=20;
  histogram y;
  format y_type $y_type.;
label y='Value of Variable';
run;


*** Figure 1.3;
proc format;
  value $y_type '1'='Variable 1: Normal Distribution'
                '2'='Variable 2: Right-Skewed'
		'3'='Variable 3: Bimodal Distribution';

  value $group  '1'='n = 15'
                '2'='n = 30'
                '3'='n = 100';
run;
proc sgpanel data=sample_means noautolegend;
  panelby y_type group / novarname columns=3 rows=3 spacing=20;
  histogram y;
  density y;
  format y_type $y_type. group $group.;
label y='Value of Sample Mean';
run;


*** Program 1.1;
title1 'Simple Random Sampling without Replacement';
title2 'Estimating a Sample Mean and its Variance Ignoring the FPC';
proc surveymeans data=sample_SRSWOR mean var;
  var exp_OTCmeds;
run;

title2 'Estimating a Sample Mean and its Variance Accounting for the FPC';
proc surveymeans data=sample_SRSWOR total=2000 mean var;
  var exp_OTCmeds;
run;
title;


*** Program 1.2;
title1 'Stratified Simple Random Sampling without Replacement';
title2 'Estimating a Sample Mean and its Variance Ignoring the Stratification';
proc surveymeans data=sample_str_SRSWOR total=2000 mean var;
  var exp_travel;
run;

data totals;
  length cityside $4;
  input cityside  _TOTAL_;
datalines;
East 1000
West 1000
;
run;
title2 'Estimating a Sample Mean and its Variance Accounting for the Stratification';
proc surveymeans data=sample_str_SRSWOR total=totals mean var;
  strata cityside;
  var exp_travel;
run;
title;


*** Figure 1.4;
proc sort data=sample_str_SRSWOR out=junk (keep=blockID adultID cityside);
  by cityside blockID; run;
data sample_str_SRSWOR;
  merge sample_str_SRSWOR
        junk;
  by cityside;
run;
proc sgplot data=sample_str_SRSWOR noautolegend;
  scatter x=adultID y=exp_travel / group=cityside markerattrs=(symbol='X');
  refline 1000 / axis=X lineattrs=(thickness=3);
run;
* NOTE: stratum mean reference lines were added to plot using Windows image editor;


*** Program 1.3;
title1 'Cluster Sampling';
title2 'Estimating a Sample Mean and its Variance Ignoring the Clustering';
proc surveymeans data=sample_clus mean var;
  var exp_OTCmeds exp_travel;
run;

title2 'Estimating a Sample Mean and its Variance Accounting for the Clustering';
proc surveymeans data=sample_clus mean var;
  cluster blockID;
  var exp_OTCmeds exp_travel;
run;
title;


*** Figure 1.5;
* visualizing the dispersion of cluster-specific means in the sample;
data toplot;
  set sample_clus (in=a rename=(exp_OTCmeds=expense))
      sample_clus (in=b rename=(exp_travel=expense));
if a then type='OTC Meds';
if b then type='Travel';
run;
proc sgpanel data=toplot;
  panelby type / uniscale=row;
  dot blockID / response=expense stat=mean limits=both;
run;


*** Program 1.4;
proc means data=sample_clus noprint nway;
  class blockID;
  var exp_OTCmeds exp_travel;
output out=cluster_means mean=;
run;

proc surveymeans data=cluster_means mean var;
  var exp_OTCmeds exp_travel;
run;


*** Program 1.5;
title1 'Sampling with Weights';
title2 'Estimating Two Sample Means and Variances Ignoring the Weights';
proc surveymeans data=sample_weights mean var;
  var exp_OTCmeds exp_travel;
run;

title2 'Estimating Two Sample Means and Variances Accounting for the Weights';
proc surveymeans data=sample_weights mean var;
  var exp_OTCmeds exp_travel;
  weight weight;
run;
title;


****** Chapter 2: Drawing Random Samples Using PROC SURVEYSELECT;

*** Program 2.1;
proc surveyselect data=frame out=sample_SRSWOR sampsize=400 method=SRS
  seed=40029;
run;


*** Program 2.2;
proc surveyselect data=frame out=sample_SRSWR sampsize=400 seed=22207
  method=URS outhits;
run;


*** Program 2.3;
proc surveyselect data=frame out=sample_SYS sampsize=400 seed=65401
  method=SYS;
run;


*** Program 2.4;
proc surveyselect data=frame out=sample_SYS_c sampsize=400 seed=94424
  method=SYS;
  control income;
run;


*** Program 2.5;
proc surveyselect data=frame out=sample_PPSWOR sampsize=400 seed=32124
  method=PPS;
  size income;
run;


*** Program 2.6;
proc sort data=frame;
  by cityside;
run;

data sampsizes;
  length cityside $4;
  input cityside  _NSIZE_;
datalines;
East 100
West 300
;
run;

proc surveyselect data=frame out=sample_STR_SRS sampsize=sampsizes
  seed=89045;
  strata cityside;
run;


*** Program 2.7;
proc surveyselect data=frame out=sample_cluster sampsize=20 seed=82908;
  cluster blockID;
run;


*** Program 2.8;
proc surveyselect data=frame out=sample_PPS_cluster sampsize=20 seed=49721
  method=PPS;
  cluster blockID;
  size income;
run;


*** Program 2.9;
proc sort data=frame;
  by cityside;
run;

data sampsizes_clusters;
  length cityside $4;
  input cityside  _NSIZE_;
datalines;
East 5
West 15
;
run;

proc surveyselect data=frame out=sample_str_cluster
  sampsize=sampsizes_clusters seed=77472;
  strata cityside;
  cluster blockID;
run;


*** Program 2.10;
proc sort data=frame;
  by cityside;
run;

data sampsizes_clusters2;
  length cityside $4;
  input cityside  _NSIZE_;
datalines;
East 10
West 30
;
run;

* Step 1) selection of PSUs (blocks);
proc surveyselect data=frame seed=78410 sampsize=sampsizes_clusters2
  out=sample_prelim (rename=(SelectionProb  = SelectProb1
                             SamplingWeight = SamplingWeight1));
  strata cityside;
  cluster blockID;
run;

* Step 2) selection of SSUs (adults within blocks);
proc surveyselect data=sample_prelim seed=64107 rate=.5
  out=sample_multistage (rename=(SelectionProb  = SelectProb2
                                 SamplingWeight = SamplingWeight2));
  strata cityside blockID;
run;

* assign variables SelectionProb and SamplingWeight to be the product of the
  corresponding variables output from the two SURVEYSELECT runs above;
data sample_multistage;
  set sample_multistage;
SelectionProb=SelectProb1*SelectProb2;
SamplingWeight=SamplingWeight1*SamplingWeight2;
run;

* verify the weights sum to known totals from the sampling frame;
proc means data=sample_multistage nway;
  class cityside;
  var SamplingWeight;
run;


****** Chapter 3: Analyzing Continuous Variables Using PROC SURVEYMEANS;

*** Program 3.1;
proc surveymeans data=CBECS_2003 sum;
  strata STRATUM8;
  cluster PAIR8;
  var SQFT8;
weight ADJWT8;
run;


*** Program 3.2;
proc format;
  value YESNO
    1='Yes'
    2='No';
run;

proc surveymeans data=CBECS_2003 sum;
  strata STRATUM8;
  cluster PAIR8;
  class STUSED8;
  var STUSED8;
weight ADJWT8;
format STUSED8 YESNO.;
run;


*** Program 3.3;
proc surveymeans data=CBECS_2003 mean;
  strata STRATUM8;
  cluster PAIR8;
  var SQFT8;
weight ADJWT8;
run;


*** Program 3.4;
proc format;
  value YESNO
    1='Yes'
    2='No';
run;

proc surveymeans data=CBECS_2003 mean;
  strata STRATUM8;
  cluster PAIR8;
  class NGUSED8;
  var NGUSED8;
weight ADJWT8;
format NGUSED8 YESNO.;
run;


*** Program 3.5;
data ratio_example;
  set CBECS_2003;
num=ADJWT8*SQFT8;
den=ADJWT8;
run;

proc surveymeans data=ratio_example ratio;
  strata STRATUM8;
  cluster PAIR8;
  var num den;
  ratio num / den;
run;


*** Program 3.6;
proc surveymeans data=CBECS_2003 ratio;
  strata STRATUM8;
  cluster PAIR8;
  var ELCNS8 SQFT8;
  ratio ELCNS8 / SQFT8;
weight ADJWT8;
run;


*** Program 3.7;
* ratio estimator of a total;
data ex_ratio_total;
  set CBECS_2003;
ELCNS8_=ELCNS8*71657900522;
run;
proc surveymeans data=ex_ratio_total ratio;
  strata STRATUM8;
  cluster PAIR8;
  var ELCNS8_ SQFT8;
  ratio ELCNS8_ / SQFT8;
weight ADJWT8;
run;

* traditional estimator of a total;
proc surveymeans data=CBECS_2003 sum;
  strata STRATUM8;
  cluster PAIR8;
  var ELCNS8;
weight ADJWT8;
run;


*** Program 3.8;
proc surveymeans data=CBECS_2003 quartiles mean;
  strata STRATUM8;
  cluster PAIR8;
  var ELEXP8;
weight ADJWT8;
run;


****** Chapter 4: Analyzing Categorical Variables Using PROC SURVEYFREQ;

*** Program 4.1;
proc format;
  value EVRMARRY
    0='NEVER MARRIED'  
    1='MARRIED';
run;

proc surveymeans data=NSFG_0610_F nobs sum mean;
  strata SEST;
  cluster SECU;
  class EVRMARRY;
  var EVRMARRY;
format EVRMARRY EVRMARRY.;
weight WGTQ1Q16;
run;

proc surveyfreq data=NSFG_0610_F;
  strata SEST;
  cluster SECU;
  tables EVRMARRY;
format EVRMARRY EVRMARRY.;
weight WGTQ1Q16;
run;


*** Program 4.2;
proc surveyfreq data=NSFG_0610_F;
  strata SEST;
  cluster SECU;
  tables PREGNUM / nowt nofreq cl;
weight WGTQ1Q16;
run;


*** Program 4.3;
proc surveyfreq data=NSFG_0610_F;
  strata SEST;
  cluster SECU;
  tables PREGNUM / nowt nofreq cl(psmall=.1 type=logit);
weight WGTQ1Q16;
run;


*** Program 4.4;
proc format;
  value PRG4PLUS
    0='NONE'  
    1='1 PREGNANCY'  
    2='2 PREGNANCIES'  
    3='3 PREGNANCIES'  
    other='4 OR MORE PREGNANCIES' ;
run;

proc surveyfreq data=NSFG_0610_F;
  strata SEST;
  cluster SECU;
  tables PREGNUM / chisq lrchisq testp=(.4 .15 .15 .15 .15);
format PREGNUM PRG4PLUS.;
weight WGTQ1Q16;
run;


*** Program 4.5;
proc format;
  value EVRMARRY
    0='NEVER MARRIED'  
    1='MARRIED';

  value RELIGION
    1='NO RELIGION'  
    2='CATHOLIC'  
    3='PROTESTANT'  
    4='OTHER RELIGIONS';
run;

proc surveyfreq data=NSFG_0610_F;
  strata SEST;
  cluster SECU;
  tables RELIGION*EVRMARRY;
format RELIGION RELIGION.
       EVRMARRY EVRMARRY.;
weight WGTQ1Q16;
run;


*** Program 4.6;
proc format;
  value EVRMARRY
    0='NEVER MARRIED'  
    1='MARRIED';

  value RELIGION
    1='NO RELIGION'  
    2='CATHOLIC'  
    3='PROTESTANT'  
    4='OTHER RELIGIONS' ;
run;

proc surveyfreq data=NSFG_0610_F;
  strata SEST;
  cluster SECU;
  tables RELIGION*EVRMARRY / chisq lrchisq;
format RELIGION RELIGION.
       EVRMARRY EVRMARRY.;
weight WGTQ1Q16;
run;


*** Program 4.7;
proc format;
  value EVRMARRY
    1='1. MARRIED'  
    0='2. NEVER MARRIED';

  value PILLR
    1='1. YES'  
    2='2. NO';
run;

proc surveyfreq data=NSFG_0610_F order=formatted;
  strata SEST;
  cluster SECU;
  tables EVRMARRY*PILLR / nofreq nocellpercent row risk relrisk;
format EVRMARRY EVRMARRY.
       PILLR PILLR.;
weight WGTQ1Q16;
run;


*** Program 4.8;
proc format;
  value RELIGION
    1='1. NOT RELIGIOUS'  
    2,3,4='2. RELIGIOUS';

  value EVRMARRY
    1='1. MARRIED'  
    0='2. NEVER MARRIED';

  value PILLR
    1='1. YES'  
    2='2. NO';
run;

ods graphics on;
proc surveyfreq data=NSFG_0610_F order=formatted;
  strata SEST;
  cluster SECU;
  tables RELIGION*EVRMARRY*PILLR / nofreq nocellpercent row relrisk
                                   plots=relriskplot;
format RELIGION RELIGION.
       EVRMARRY EVRMARRY.
       PILLR PILLR.;
weight WGTQ1Q16;
run;
ods graphics off;


****** Chapter 5: Fitting Linear Regression Models Using PROC SURVEYREG;

*** Figure 5.1;
data bivariate;
  do x=0 to 10 by .5;
  y=x + 2*rannor(34442);
  output;
  end;
run;
proc sgplot data=bivariate noautolegend;
  reg X=x Y=y /
  markerattrs=(size=8 symbol=circlefilled)
  lineattrs=(color=black thickness=3);
xaxis values=(0 to 11 by 1);
yaxis values=(0 to 11 by 1);
run;


*** Figure 5.2;
data violations;
  do x=1 to 10 by .25;
  hetero=0 + x*.4*rannor(23401);
  if x le 4 then hetero=hetero/x;
  mis=-5 + x + rannor(777548);
  serial=3*sin(x) + .5*rannor(342021);
  output;
end;
run;
data violations;
  set violations (in=a keep=X hetero rename=(hetero=y))
      violations (in=b keep=X mis rename=(mis=y))
      violations (in=c keep=X serial rename=(serial=y));
length Type $25;
if a then type='Heteroscedasticity';
if b then type='Model Misspecification';
if c then type='Serial Correlation';
run;
proc sgpanel;
  panelby type / rows=3 spacing=10;
  scatter X=x Y=y / markerattrs=(size=5 symbol=circlefilled);
refline 0 /axis=Y;
colaxis values=(1  to 10 by 1);
rowaxis values=(-5 to 5  by 5);
label x='Predicted Value'
      y='Residual';
run;


*** Program 5.1;
proc surveyreg data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class MED SEX RACER SOLO MAJOR;
  model TIMEMD = MED SEX RACER SOLO MAJOR 
                 AGE TOTCHRON / solution;
weight PATWT;
run;


*** Program 5.2;
proc surveyreg data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class MED SEX RACER SOLO MAJOR;
  model TIMEMD = MED SEX RACER SOLO MAJOR 
                 AGE TOTCHRON / solution;
weight PATWT;
contrast 'Test of Reduced Model'
         MED 1 -1, SEX 1 -1, RACER 1 0 -1, RACER 0 1 -1; 
run;


*** Program 5.3;
proc surveyreg data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class SOLO MAJOR;
  model TIMEMD = SOLO MAJOR 
                 AGE TOTCHRON / ANOVA;
weight PATWT;
output out=preds p=y_hat std=se_y_hat l=y_hat_lower u=y_hat_upper;
run;

* create prediction intervals for each sampling unit;
data preds;
  set preds;
* RMSE obtained from output of PROC SURVEYREG run above;
MSE=13.8772; 
t_crit=TINV(0.975,552);
y_pred_lower=y_hat - t_crit*SQRT(se_y_hat**2 + MSE);
y_pred_upper=y_hat + t_crit*SQRT(se_y_hat**2 + MSE);
run;


*** Program 5.4;
proc surveyreg data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class SOLO MAJOR;
  model TIMEMD = SOLO MAJOR 
                 AGE TOTCHRON;
weight PATWT;
estimate 'Case 1' INTERCEPT 1 AGE 77 TOTCHRON 1 SOLO 1 0 MAJOR 0 0 0 0 1;
estimate 'Case 2' INTERCEPT 1 AGE 45 TOTCHRON 1 SOLO 0 1 MAJOR 0 1 0 0 0;
run;


*** Program 5.5;
data addons;
  input AGE TOTCHRON SOLO MAJOR;
datalines;
77 1 1 5
45 1 2 2
;
run;

data example_expectation;
  set NAMCS_2010 
      addons;
run;

proc surveyreg data=example_expectation;
  strata CSTRATM;
  cluster CPSUM;
  class SOLO MAJOR;
  model TIMEMD = SOLO MAJOR 
                 AGE TOTCHRON;
weight PATWT;
output out=PREDS p=Y_HAT std=SE_Y_HAT;
run;

proc print data=PREDS;
  where TIMEMD=.;
  var TIMEMD SOLO MAJOR AGE TOTCHRON Y_HAT SE_Y_HAT;
run;


*** Program 5.6;
proc surveyreg data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class SOLO MAJOR;
  model TIMEMD = SOLO MAJOR 
                 AGE TOTCHRON;
weight PATWT;
estimate 'Case 1 - Case 2' AGE 32 SOLO 1 -1 MAJOR 0 -1 0 0 1;
run;


****** Chapter 6: Fitting Logistic Regression Models Using PROC SURVEYLOGISTIC;

*** Figure 6.1;
data toplot;
  input x y1;
datalines;
-1 .
0  0
1  0
2  0
3  1
4  0
5  1
6  0
7  1
8  1
9  1
10 1
11 .
;
run;

* overlay a regression line to the plot above to illustrate how the predicted values can be "out of scope";
proc reg data=toplot;
  model y1 = x;
output out=toplot2 p=pred_ols;
run;
quit;
proc sgplot data=toplot2 noautolegend;
  scatter X=x Y=y1 /
  markerattrs=(size=8 symbol=circlefilled);
  series X=x Y=pred_ols /
  lineattrs=(color=black thickness=3);
xaxis values=(-1 to 11 by  1);
yaxis values=(-.2 to 1.2 by .2);
label y1='y';
run;


*** Figure 6.2;
data toplot;
  input x y1;
datalines;
-1 .
0  0
1  0
2  0
3  1
4  0
5  1
6  0
7  1
8  1
9  1
10 1
11 .
;
run;
data toplot3;
  do x=-1 to 11 by .1;
  output;
end;
run;
data toplot3;
  merge toplot3 toplot;
  by x;
run;
proc logistic data=toplot3;
  model y1 (event='1') = x;
output out=toplot3 p=pred_logistic;
run;
quit;
proc sgplot data=toplot3 noautolegend;
  scatter X=x Y=y1 /
  markerattrs=(size=8 symbol=circlefilled);
  series X=x Y=pred_logistic /
  lineattrs=(color=black thickness=3);
xaxis values=(-1 to 11 by  1);
yaxis values=(-.2 to 1.2 by .2);
label y1='y';
run;


*** Program 6.1;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class SEX RACER MAJOR / param=ref;
  model MED (event='1') = SEX RACER MAJOR
                          AGE TOTCHRON TIMEMD;
weight PATWT; 
run;


*** Program 6.2;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class SEX RACER MAJOR / param=ref;
  model MED (event='1') = SEX RACER MAJOR
                          AGE TOTCHRON TIMEMD;
weight PATWT;
contrast 'Test of Reduced Model' SEX 1, AGE 1, TIMEMD 1; 
run;


*** Program 6.3;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model MED (event='1') = RACER MAJOR
                          TOTCHRON;
weight PATWT;
output out=residuals p=p_hat;
run;

data residuals;
  set residuals;
residual=MED-p_hat;
run;

proc sort data=residuals;
  by p_hat;
run;

* assign risk decile thresholds where cumulated sum of weights exceeds (1/g)th of the total sum of weights;
data residuals;
  set residuals;
PATWT_sum=1008802005;
PATWT_running_sum+PATWT;
decile=ceil(10*(PATWT_running_sum/PATWT_sum));
run;

proc surveyreg data=residuals;
  strata CSTRATM;
  cluster CPSUM;
  class decile;
  model residual = decile / noint solution vadjust=none;
weight PATWT;
contrast 'A-L LOF Test' decile 1 1 1 1 1 1 1 1 1 1;
run;


*** Program 6.4;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model MED (event='1') = RACER MAJOR
                          TOTCHRON;
weight PATWT;
output out=preds p=p_hat l=p_hat_lower u=p_hat_upper;
run;


*** Program 6.5;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model MED (event='1') = RACER MAJOR
                          TOTCHRON;
weight PATWT;
contrast 'B vs. W Patients' RACER -1 1 / estimate=both;
run;


*** Program 6.6;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model MED (event='1') = RACER MAJOR
                          TOTCHRON;
weight PATWT;
contrast 'TOTCHRON + 3' TOTCHRON 3 / estimate=both;
run;


*** Program 6.7;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model MED (event='1') = RACER MAJOR
                          TOTCHRON 
                          RACER*TOTCHRON;
weight PATWT;
contrast 'Custom Odds Ratio' 
         MAJOR 0 1 -1 0 RACER -1 1 RACER*TOTCHRON -1 1 / estimate=both e;
run;


*** Program 6.8;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model NUMMED (ref='0') = RACER MAJOR
                           TOTCHRON / link=GLOGIT;
weight PATWT;
run;


*** Program 6.9;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model NUMMED (ref='0') = RACER MAJOR
                           TOTCHRON / link=GLOGIT;
weight PATWT; 
test
  INTERCEPT_1=INTERCEPT_2,
  RACER1_1=RACER1_2, RACER2_1=RACER2_2,
  MAJOR1_1=MAJOR1_2, MAJOR2_1=MAJOR2_2, MAJOR3_1=MAJOR3_2, MAJOR4_1=MAJOR4_2,
  TOTCHRON_1=TOTCHRON_2;
run;


*** Program 6.10;
proc surveylogistic data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model NUMMED = RACER MAJOR
                 TOTCHRON / link=CLOGIT;
weight PATWT;
run;


****** Chapter 7: Survival Analysis with Complex Survey Data;

*** Program 7.1;
data NSFG_F_MAR;
  set NSFG_0610_F;

* maintain only those respondents who have ever been married;
if EVRMARRY=1;

* assign an indicator of being censored observations if individual is
  still married, separated (but not divorced), or if the spouse died;
censor=(MAREND01 in(. 2 3));
run;

proc format;
  value RELIGION
    1='NO RELIGION'  
    2='CATHOLIC'  
    3='PROTESTANT'  
    4='OTHER RELIGIONS';
run;

proc lifetest data=NSFG_F_MAR method=KM plots=(survival(nocensor))
  maxtime=240;
  time MAR1DISS*censor(1);
  strata RELIGION;
weight WGTQ1Q16;
format RELIGION RELIGION.;
run;


*** Program 7.2;
proc format;
  value COHEVER
    1='YES'
    2='NO';
run;

proc lifetest data=NSFG_F_MAR method=KM
  plots=(survival(failure nocensor)) maxtime=240;
  time MAR1DISS*censor(1);
  strata COHEVER;
weight WGTQ1Q16;
format COHEVER COHEVER.;
run;


*** Program 7.3;
proc surveyphreg data=NSFG_F_MAR;
  strata SEST;
  cluster SECU;
  class RELIGION (ref='1') HISPRACE (ref='2') 
        COHEVER / param=ref;
  model MAR1DISS*censor(1) = RELIGION HISPRACE COHEVER FMAR1AGE;
weight WGTQ1Q16;
run;


*** Program 7.4;
proc surveyphreg data=NSFG_F_MAR;
  strata SEST;
  cluster SECU;
  class RELIGION (ref='1') HISPRACE (ref='2') 
        COHEVER / param=ref;
  model MAR1DISS*censor(1) = RELIGION HISPRACE COHEVER FMAR1AGE;
weight WGTQ1Q16;
estimate 'Test of Reduced Model - Dropping Religion'
         RELIGION 1 0 0, RELIGION 0 1 0, RELIGION 0 0 1 / joint;
run;


*** Program 7.5;
proc format;
  value COHEVER
    1='YES'
    2='NO';
run;

proc lifetest data=NSFG_F_MAR method=KM plots=(loglogs) maxtime=240;
  time MAR1DISS*censor(1);
  strata COHEVER;
weight WGTQ1Q16;
format COHEVER COHEVER.;
run;


*** Program 7.6;
proc surveyphreg data=NSFG_F_MAR;
  strata SEST;
  cluster SECU;
  class HISPRACE (ref='2') 
        COHEVER / param=ref;
  model MAR1DISS*censor(1) = HISPRACE FMAR1AGE COHEVER COHEVERT;
weight WGTQ1Q16;
COHEVERT=(COHEVER=1)*MAR1DISS;
run;


*** Program 7.7;
data NSFG_F_MAR_PP;
  set NSFG_F_MAR (keep=CASEID SEST SECU WGTQ1Q16
                       MAR1DISS censor
                       RELIGION HISPRACE COHEVER FMAR1AGE);
intervals=ceil(MAR1DISS/12 + .000001);
do m=1 to intervals;
   output;
end;
run;

proc sort data=NSFG_F_MAR_PP;
  by CASEID;
run;

data NSFG_F_MAR_PP;
  set NSFG_F_MAR_PP;
  by CASEID;
diss=0;
if last.CASEID and censor ne 1 then diss=1;
run;

title 'Cox Proportional Hazards Version';
proc surveyphreg data=NSFG_F_MAR;
  strata SEST;
  cluster SECU;
  class RELIGION (ref='1') HISPRACE (ref='2') 
        COHEVER / param=ref;
  model MAR1DISS*censor(1) = RELIGION HISPRACE COHEVER FMAR1AGE;
weight WGTQ1Q16;
run;

title 'Discrete-Time Hazards Version';
proc surveylogistic data=NSFG_F_MAR_PP;
  strata SEST;
  cluster SECU;
  class RELIGION (ref='1') HISPRACE (ref='2') 
        COHEVER / param=ref;
  model diss (event='1') = m RELIGION HISPRACE COHEVER FMAR1AGE / expb;
weight WGTQ1Q16;
run;
title;

*** Program 7.8;
data NSFG_F_MAR_PP2;
  set NSFG_F_MAR (keep=CASEID SEST SECU WGTQ1Q16
                       MAR1DISS censor
                       RELIGION HISPRACE COHEVER FMAR1AGE
                       DATBABY1 MARDAT01);
intervals=ceil(MAR1DISS/12 + .000001);
do m=1 to intervals;
   output;
end;
run;

proc sort data=NSFG_F_MAR_PP2;
  by CASEID;
run;

data NSFG_F_MAR_PP2;
  set NSFG_F_MAR_PP2;
  by CASEID;
* insert indicator of whether child is present during marriage interval;
children=(DATBABY1 ne . and DATBABY1 le MARDAT01+(m*12));
diss=0;
if last.CASEID and censor ne 1 then diss=1;
run;

proc surveylogistic data=NSFG_F_MAR_PP2;
  strata SEST;
  cluster SECU;
  class RELIGION (ref='1') HISPRACE (ref='2') 
        COHEVER / param=ref;
  model diss (event='1') = m RELIGION HISPRACE COHEVER FMAR1AGE children /
                           expb;
weight WGTQ1Q16;
run;


****** Chapter 8: Domain Estimation;

*** Program 8.1;
data test;
  input class class_type $ homeroom grade1 tutor $ grade2 weight;
datalines;
12 upper 1 87 Y 94 49.5
12 upper 1 89 N 89 49.5
12 upper 2 91 Y 94 48
12 upper 2 84 Y 92 48
11 upper 1 82 N 84 47.5
11 upper 1 94 N 95 47.5
11 upper 2 93 N 95 48
11 upper 2 94 Y 97 48
10 under 1 78 N 81 39
10 under 1 84 N 84 39
10 under 2 90 N 87 37.5
10 under 2 82 N 85 37.5
 9 under 1 88 N 88 40
 9 under 1 93 Y 91 40
 9 under 2 77 Y 85 48
 9 under 2 81 N 84 48
;
run;

title '1) Analyzing Proper Subsets with the BY statement';
proc sort data=test;
  by class_type;
run;
proc surveymeans data=test mean stderr;
  by class_type;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
run;

title '2) Analyzing Proper Subsets with the DOMAIN statement';
proc surveymeans data=test mean stderr;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
domain class_type;
run;
title;


*** Program 8.2;
title '1) Improperly Analyzing a Domain by Subsetting the Data Set';
proc surveymeans data=test mean stderr;
  where tutor='Y';
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
run;

title '2) Correctly Analyzing a Domain with the DOMAIN Statement';
proc surveymeans data=test mean stderr;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
domain tutor;
run;
title;


*** Program 8.3;
data test_d;
  set test;
weight_d=weight*(tutor='Y') + .0000000000001;
run;

proc surveymeans data=test_d mean stderr median;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight_d;
run;


*** Program 8.4;
proc format;
  value YESNO
    1='Yes'
    2='No';

  value REGION
    1='Northeast'
    2='Midwest'
    3='South'
    4='West';
run;

proc surveymeans data=CBECS_2003 nobs sum;
  strata STRATUM8;
  cluster PAIR8;
  class STUSED8;
  var STUSED8;
weight ADJWT8;
domain REGION8;
format STUSED8 YESNO. REGION8 REGION.;
run;

* an equivalent analysis using PROC SURVEYFREQ;
proc surveyfreq data=CBECS_2003;
  strata STRATUM8;
  cluster PAIR8;
  tables REGION8*STUSED8 / nopercent;
weight ADJWT8;
format STUSED8 YESNO. REGION8 REGION.;
run;


*** Program 8.5;
title1 '1) Illustrating the DOMAIN Statement in PROC SURVEYREG';
proc surveyreg data=NAMCS_2010;
  strata CSTRATM;
  cluster CPSUM;
  class SOLO MAJOR;
  model TIMEMD = SOLO MAJOR
                 AGE TOTCHRON / solution;
weight PATWT;
domain PHYS;
run;

title1 '2) Illustrating the Domain-Specific Weight Approach for Estimating Regression Coefficients';
data NAMCS_2010_d;
  set NAMCS_2010;
PATWT_d=PHYS*PATWT + .0000000000001; 
run;
proc surveyreg data=NAMCS_2010_d;
  strata CSTRATM;
  cluster CPSUM;
  class SOLO MAJOR;
  model TIMEMD = SOLO MAJOR
                 AGE TOTCHRON / solution;
weight PATWT_d;
run;
title;


*** Program 8.6;
title '1) Testing Year-End Mean Score for Underclassmen vs. Upperclassmen';
proc surveyreg data=test;
  stratum class;
  cluster homeroom;
  class class_type;
  model grade2 = class_type / solution vadjust=none;
weight weight;
run;

title '2) Testing Year-End Mean Score for Tutored Students vs. Non-Tutored Students';
proc surveyreg data=test;
  stratum class;
  cluster homeroom;
  class tutor;
  model grade2 = tutor / solution vadjust=none;
weight weight;
run;
title;


*** Program 8.7;
* difference variable approach;
data test;
  set test;
diff=grade2-grade1;
run;
proc surveymeans data=test mean stderr t;
  stratum class;
  cluster homeroom;
  var diff;
weight weight;
run;

* data stacking approach;
data test_stacked;
  set test (in=d1 rename=(grade1=grade))
      test (in=d2 rename=(grade2=grade));
domain=d2;
run;
proc surveyreg data=test_stacked;
  stratum class;
  cluster homeroom;
  model grade = domain / vadjust=none;
weight weight;
run;


*** Program 8.8;
proc surveyreg data=NAMCS_2010;
  stratum CSTRATM;
  cluster CPSUM;
  class MAJOR;
  model TIMEMD = MAJOR / vadjust=none;
weight PATWT;
lsmeans MAJOR / diff;
run;


*** Program 8.9;
proc surveyreg data=NAMCS_2010;
  stratum CSTRATM;
  cluster CPSUM;
  class SOLO;
  model MED = SOLO / vadjust=none solution clparm;
weight PATWT;
run;

proc surveyfreq data=NAMCS_2010;
  stratum CSTRATM;
  cluster CPSUM;
  tables SOLO*MED / nowt row riskdiff;
weight PATWT;
run;


*** Program 8.10;
title '1) 95% Confidence Limits without a DF Adjustment';
data test_d;
  set test;
weight_d=weight*(tutor='Y') + .0000000000001;
run;
proc surveymeans data=test_d mean stderr clm df;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight_d;
run;


title '2) 95% Confidence Limits with DF Adjustment per Heeringa et al.
          (2010)';
* preliminary PROC SQL step subsets the data for strata with at least one
  PSU in the domain defined by TUTOR='Y';
proc sql;
  create table test_d_adj as
  select * 
    from test_d
   where class in(select distinct class from test_d where tutor='Y');
quit;
proc surveymeans data=test_d_adj mean stderr clm df;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight_d;
run;
title;


****** Chapter 9: Replication Techniques for Variance Estimation;

*** Program 9.1;
data test;
  input class class_type $ homeroom grade1 tutor $ grade2 weight;
datalines;
12 upper 1 87 Y 94 49.5
12 upper 1 89 N 89 49.5
12 upper 2 91 Y 94 48
12 upper 2 84 Y 92 48
11 upper 1 82 N 84 47.5
11 upper 1 94 N 95 47.5
11 upper 2 93 N 95 48
11 upper 2 94 Y 97 48
10 under 1 78 N 81 39
10 under 1 84 N 84 39
10 under 2 90 N 87 37.5
10 under 2 82 N 85 37.5
 9 under 1 88 N 88 40
 9 under 1 93 Y 91 40
 9 under 2 77 Y 85 48
 9 under 2 81 N 84 48
;
run;

title '1) Demonstrating the Woodruff Algorithm for Taylor Series Linearization';
* compute the PSU-level linear substitute term;
proc sql;
  select sum(weight)        into :N_hat from test;
  select sum(weight*grade2) into :Y_hat from test;

  create table lin_subs as
  select class,homeroom,
         (1/&N_hat)*(sum(weight*grade2) - (&Y_hat/&N_hat)*sum(weight)) as u_i
    from test
   group by class,homeroom;
quit;

* estimate variance of the sum of the linear substitute with respect
  to the design;
proc surveymeans data=lin_subs varsum std;
  stratum class;
  cluster homeroom;
  var u_i;
run;

title '2) Standard PROC SURVEYMEANS Syntax for a Weighted Mean';
proc surveymeans data=test mean var stderr;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
run;
title;


*** Program 9.2;
proc surveymeans data=test varmethod=BRR mean var stderr;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
run;


*** Program 9.3;
proc surveymeans data=test varmethod=BRR (outweights=test_BRR)
  mean var stderr;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
run;


*** Program 9.4;
proc surveymeans data=test_BRR varmethod=BRR mean var stderr;
  var grade2;
weight weight;
repweights RepWt_1-RepWt_8;
run;


*** Program 9.5;
proc surveymeans data=test varmethod=BRR (outweights=test_BRR_Fay fay=.3)
  mean stderr;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight;
run;


*** Program 9.6;
title '1) Domain Estimation with BRR';
proc surveymeans data=test_BRR varmethod=BRR mean stderr;
  var grade2;
weight weight;
repweights RepWt_1-RepWt_8;
domain tutor;
run;

title "2) Domain Estimation with Fay's BRR";
proc surveymeans data=test_BRR_Fay varmethod=BRR (fay=.3) mean stderr;
  var grade2;
weight weight;
repweights RepWt_1-RepWt_8;
domain tutor;
run;
title;


*** Program 9.7;
proc surveymeans data=test varmethod=JK 
 (outweights=test_JK outjkcoefs=test_JK_coefs) mean stderr;
  stratum class;
  cluster homeroom;
  var grade2;
weight weight; 
run;


*** Program 9.8;
proc surveymeans data=test_JK varmethod=JK mean stderr;
  var grade2;
weight weight;
repweights RepWt_1-RepWt_8 / jkcoefs=test_JK_coefs;
run;


*** Program 9.9;
proc surveymeans data=test_JK varmethod=JK mean stderr;
  var grade2;
weight weight;
repweights RepWt_2 RepWt_3 RepWt_5 RepWt_8 / jkcoefs=0.99999;
run;


*** Program 9.10;
* compile bootstrap replicate weight macro;
* SOURCE:
Lohr, S. (2012). "Using SAS� for the Design, Analysis, and Visualization of Complex Surveys,"
Proceedings of the SAS Global Forum. Available on-line at:
http://support.sas.com/resources/papers/proceedings12/343-2012.pdf 
;
* NOTE: a modification made here is adding the macro paramater SEED to ensure 
  weight creation process is reproducible;
%macro bootwt (fulldata,wt,stratvar,psuvar,numboot,fullrep,repwt,seed);
proc sort data=&fulldata out=fulldata;
  by &stratvar &psuvar;
run;

proc sql stimer;
  create table psulist as
  select distinct &stratvar, &psuvar
    from fulldata
   order by &stratvar, &psuvar;

  create table numpsu as
  select distinct &stratvar, count(*)-1 as _nsize_
    from psulist
   group by &stratvar
   order by &stratvar;
quit;

data fulldata (drop=_nsize_);
  merge fulldata (in=inf)
        numpsu   (in=inn);
  by &stratvar;
if inf & inn;
wtmult = (_nsize_ + 1) / _nsize_;
run;

proc surveyselect data=psulist method=urs sampsize=numpsu 
                  out=repout outall reps=&numboot seed=&seed;
  strata &stratvar;
  id &psuvar;
run;

proc sort data=repout (keep=&stratvar &psuvar replicate numberhits)
          out=repout_sorted;
  by &stratvar &psuvar replicate;
run;

proc transpose data=repout_sorted (keep=&stratvar &psuvar replicate
                                        numberhits)
     out=repout_tr (keep=&stratvar &psuvar repmult:)
     prefix=repmult;
  by &stratvar &psuvar;
  id replicate;
  var numberhits;
run;

data &fullrep (drop=i repmult1-repmult&numboot wtmult);
array &repwt  (&numboot);
array repmult (&numboot);
  merge fulldata  (in=inf)
        repout_tr (in=inr);
  by &stratvar &psuvar;
  do i = 1 to &numboot;
    &repwt(i) = &wt * repmult(i) * wtmult;
  end;
run;

%mend bootwt;

%bootwt (test,weight,class,homeroom,200,test_boot,RepWt_,399448);


*** Program 9.11;
proc surveymeans data=test_boot varmethod=JK mean stderr;
  var grade2;
weight weight;
repweights RepWt_1-RepWt_200 / jkcoef=0.005025;
run;


*** Program 9.12;
data test_brr;
  set test_brr;
tutor_Y=(tutor='Y');
run;

proc surveyreg data=test_brr varmethod=BRR;
  model grade2 = tutor_Y / covb;
weight weight;
repweights RepWt_1-RepWt_8;
run;


*** Program 9.13;
title '1) Taylor Series Linearization';
proc surveylogistic data=NAMCS_2010 varmethod=Taylor;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model MED (event='1') = RACER MAJOR
                          TOTCHRON;
weight PATWT;
run;

title '2) Jackknife Repeated Replication';
proc surveylogistic data=NAMCS_2010 varmethod=JK;
  strata CSTRATM;
  cluster CPSUM;
  class RACER MAJOR / param=ref;
  model MED (event='1') = RACER MAJOR
                          TOTCHRON;
weight PATWT;
run;
title;


*** Program 9.14;
* generate BRR replicate weights;
proc surveymeans data=CBECS_2003 varmethod=BRR (outweights=CBECS_2003_brr);
  stratum STRATUM8;
  cluster PAIR8;
  var ELCNS8 SQFT8;
  ratio ELCNS8 / SQFT8;
weight ADJWT8;
domain PBA8;
run;

%macro BRR_ratiodiff(replicates);
* housecleaning;
proc sql; drop table BRR_est_all; quit;

* store full-sample estimates in macro variables;
proc sql noprint;
select sum(ADJWT8*ELCNS8)/sum(ADJWT8*SQFT8) into :ratio1 
  from CBECS_2003_brr where PBA8=16;
select sum(ADJWT8*ELCNS8)/sum(ADJWT8*SQFT8) into :ratio2
  from CBECS_2003_brr where PBA8=8;
select sum((PBA8=16)*ADJWT8*ELCNS8)/sum((PBA8=16)*ADJWT8*SQFT8) - 
       sum((PBA8=8 )*ADJWT8*ELCNS8)/sum((PBA8=8 )*ADJWT8*SQFT8)
         into :ratio_diff_FS
  from CBECS_2003_brr;
quit;

* loop through all replicate weights and get the same estimate;
%do r=1 %to &replicates;
proc sql;
  create table BRR_est_&r as
  select ratio1 - ratio2 as est from
 (select sum((PBA8=16)*RepWt_&r*ELCNS8)/sum((PBA8=16)*RepWt_&r*SQFT8) as
      ratio1
    from CBECS_2003_brr),
 (select sum((PBA8=8 )*RepWt_&r*ELCNS8)/sum((PBA8=8 )*RepWt_&r*SQFT8) as
      ratio2
    from CBECS_2003_brr);
quit;
proc datasets nolist;
  append data=BRR_est_&r base=BRR_est_all FORCE;
  delete BRR_est_&r;
run;
quit;
%end; 

* consolidate replicate-specific estimates and find squared deviations from
  full-sample estimate;
data BRR_est_all;
  set BRR_est_all;
var=(1/&replicates)*(est - &ratio_diff_FS)**2;
run;

proc sql;
  select &ratio1 as Ratio1,&ratio2 as Ratio2,
         &ratio1 - &ratio2 as Difference,
         sqrt(sum(var)) as SE_brr,
         sum(var) as var_brr
    from BRR_est_all;
quit; 
%mend;

%BRR_ratiodiff(48);


* Woodruff TSL approach to estimate the variance of the same quantity;
proc sql noprint;
  select sum(ADJWT8*ELCNS8) into :T1 from CBECS_2003 where PBA8=16;
  select sum(ADJWT8*SQFT8)  into :T2 from CBECS_2003 where PBA8=16;
  select sum(ADJWT8*ELCNS8) into :T3 from CBECS_2003 where PBA8=8;
  select sum(ADJWT8*SQFT8)  into :T4 from CBECS_2003 where PBA8=8;

  create table lin_subs as
  select STRATUM8,PAIR8,
        sum((1/&T2)*(PBA8=16)*(ADJWT8*ELCNS8 - (&T1/&T2)*ADJWT8*SQFT8) - 
            (1/&T4)*(PBA8=8 )*(ADJWT8*ELCNS8 - (&T3/&T4)*ADJWT8*SQFT8))
              as u_i
    from CBECS_2003
group by STRATUM8,PAIR8;
quit;

* estimate variance of the sum of the linear substitute with respect to the
  design;
proc surveymeans data=lin_subs varsum std;
  stratum STRATUM8;
  cluster PAIR8;
  var u_i;
run;


*** Program 9.15;
title '1) Bootstrap CI Using Default DF Calculation';
proc surveymeans data=test_boot varmethod=JK mean stderr clm df;
  var grade2;
weight weight;
repweights RepWt_1-RepWt_200 / jkcoef=0.005025;
run;

title '2) Bootstrap CI Overriding Default DF Calculation';
proc surveymeans data=test_boot varmethod=JK mean stderr clm df;
  var grade2;
weight weight;
repweights RepWt_1-RepWt_200 / jkcoef=0.005025 DF=4;
run;
title;


****** Chapter 10: Weight Adjustment Methods;

*** Program 10.1;
title 'Sum of Base Weights for Sampling units';
proc freq data=samp;
  tables gender*minority / list nocum;
weight weight_base;
run;

title 'Sum of Base Weights for Respondents';
proc freq data=samp;
  where respond=1;
  tables gender*minority / list nocum;
weight weight_base;
run;
title;


*** Program 10.2;
* sum the base weights for sampling units;
proc freq data=samp;
  tables gender*minority / out=counts_s (rename=(count=count_s));
weight weight_base;
run;

* sum the base weights for respondents;
proc freq data=samp;
  where respond=1;
  tables gender*minority / out=counts_r (rename=(count=count_r));
weight weight_base;
run;

* create data set of adjustment cell weight inflation factors;
data cell_factors;
  merge counts_s
        counts_r;
  by gender minority;
factor_adj_cell=count_s/count_r;
keep gender minority factor_adj_cell;
run;

* merge these factors into sample data set and assign a new,
  nonresponse-adjusted weight;
proc sort data=samp;         by gender minority; run;
proc sort data=cell_factors; by gender minority; run;
data samp;
  merge samp         (in=a)
        cell_factors (in=b);
  by gender minority;
if a;
weight_adj_cell=weight_base*factor_adj_cell*(respond=1);
run;

* verify sum of adjusted weights for respondents match sum of base weights
  for the full sample;
title 'Sum of Adjustment Cell Weights';
proc freq data=samp;
  where respond=1;
  tables gender*minority / list nocum;
weight weight_adj_cell;
run;
title;


*** Program 10.3;
* estimate response propensities via logistic regression;
proc logistic data=samp descending;
  class gender minority / param=ref;
  model respond (event='1') = age|gender|supervisor|minority @2;
output out=samp p=p_hat;
run;

* stratify the propensities into 5 cells;
proc rank data=samp out=samp groups=5;
  var p_hat;
  ranks propensity_cell;
run;

* sum base weights of all sampling units in a cell;
proc freq data=samp;
  tables propensity_cell / out=counts_s (rename=(count=count_s));
weight weight_base;
run;

* sum base weights of responding sampling units in a cell;
proc freq data=samp;
  where respond=1;
  tables propensity_cell / out=counts_r (rename=(count=count_r));
weight weight_base;
run;

* create data set of adjustment cell weight inflation factors;
data cell_factors;
  merge counts_s
        counts_r;
  by propensity_cell;
factor_prop_cell=count_s/count_r;
keep propensity_cell factor_prop_cell;
run;

* merge these into sample data set and assign new, nonresponse-adjusted weight;
proc sort data=samp;         by propensity_cell; run;
proc sort data=cell_factors; by propensity_cell; run;
data samp;
  merge samp         (in=a)
        cell_factors (in=b);
  by propensity_cell;
if a;
weight_adj_prop_cell=weight_base*factor_prop_cell*(respond=1);
run;


*** Program 10.4;
* create supplemental data set with control totals;
proc freq data=pop;
  tables gender*minority / list out=counts_p (rename=(count=_PSTOTAL_));
run;


* set nonrespondent weights to missing;
data samp2;
  set samp;
if respond=0 then weight_base=.;
run;
proc surveymeans data=samp2 noprint;
  weight weight_base;
poststrata gender minority / pstotal=counts_p out=samp_weight_ps;
run;


*** Program 10.5;
* create supplemental data set with adjustment cell base-weighted totals;
proc freq data=samp;
  tables gender*minority / out=counts_s (rename=(count=_PSTOTAL_));
weight weight_base;
run;

* set nonrespondent weights to missing;
data samp2;
  set samp;
* set nonrespondent weights to missing;
if respond=0 then weight_base=.;
run;

proc surveymeans data=samp2 noprint;
  weight weight_base;
poststrata gender minority / pstotal=counts_s out=samp_weight_adj_cell;
run;


*** Program 10.6;
* read in same test score data set used in Chapter 8 and 9;
data test;
  input class class_type $ homeroom grade1 tutor $ grade2 weight;
datalines;
12 upper 1 87 Y 94 49.5
12 upper 1 89 N 89 49.5
12 upper 2 91 Y 94 48
12 upper 2 84 Y 92 48
11 upper 1 82 N 84 47.5
11 upper 1 94 N 95 47.5
11 upper 2 93 N 95 48
11 upper 2 94 Y 97 48
10 under 1 78 N 81 39
10 under 1 84 N 84 39
10 under 2 90 N 87 37.5
10 under 2 82 N 85 37.5
 9 under 1 88 N 88 40
 9 under 1 93 Y 91 40
 9 under 2 77 Y 85 48
 9 under 2 81 N 84 48
;
run;

* create supplemental data set with class totals;
data class_totals;
  input class _PSTOTAL_;
datalines;
 9 400
10 360
11 395
12 405
;
run;
 
proc surveymeans data=test varmethod=BRR (outweights=test_BRR_ps) noprint;
  stratum class;
  cluster homeroom;
weight weight;
poststrata class / pstotal=class_totals;
run;


*** Program 10.7;
* create the supplemental data sets of marginal totals;
proc freq data=pop;
  tables gender     / out=gender     (drop=percent rename=(count=MRGTOTAL));
  tables agecat     / out=agecat     (drop=percent rename=(count=MRGTOTAL));
  tables minority   / out=minority   (drop=percent rename=(count=MRGTOTAL));
  tables supervisor / out=supervisor (drop=percent rename=(count=MRGTOTAL));
run;

* filter sample data set for only respondents;
data respondents;
  set samp;
where respond=1;
run;

* complile raking macro;
* SOURCE: 
Izrael, D., Hoaglin, D., and Battaglia, M. (2000). "A SAS Macro for Balancing a Weighted Sample,"
Proceedings of the SAS Users Group International (SUGI) Conference.
Available on-line at: http://www2.sas.com/proceedings/sugi25/25/st/25p258.pdf
;
%MACRO RAKING
(inds=, /* input data set */
outds=, /*output data set */
inwt=, /*weight to be raked */
freqlist=,/*list of data sets with marginal */
/*freqs or marginal control totals*/
/*they must contain name of raking*/
/* variable and either variable */
/* PERCENT or MRGTOTAL, e.g. direct*/
/* counts of marginal control totals;*/
/* by default it is assigned names of*/
/* raking variables */
outwt=, /* resulting raked weight */
byvar=, /* variable BY which user might want */
/* to set up raking. */
/* By default, raking is done */
/* over the whole input data set */
varlist=, /* list of variables to be raked on */
numvar=, /* number of raking variables */
cntotal=, /* general control total; it is */
/* required if freqlist data sets */
/* contains PERCENT; */
/* if byvar not empty, cntotal */
/* must be present for each byvar */
trmprec=1, /* tolerance,default is 1 */
numiter=50);/*number of iterations,default is 50*/
%macro reqpar (param); /** checking on required **/
%if (&&&param eq ) %then %do; /** parameters **/
%put **** Program terminated: Macro parameter
%upcase(&PARAM) missing ****;
endsas;
%end;
%mend;
%reqpar(inds) %reqpar(outds) %reqpar (inwt)
%reqpar(outwt) %reqpar(varlist)
%reqpar(numvar) %reqpar(trmprec) %reqpar(numiter)
%let varlist=%upcase(&varlist);
%let byvar=%upcase(&byvar);
%let ls=%sysfunc(getoption(ls,keyword));
%let ps=%sysfunc(getoption(ps,keyword));
/* checking on number of raking variables */
%if (%scan(&varlist,&numvar) eq ) or
(%scan(&varlist,%eval(&numvar+1)) ne )
%then %do;
%put **** Program terminated: Number of variables in the
VARLIST ****;
%put **** does not match NUMVAR ****;
endsas;
%end;
data __i0;
set &inds;
weight=&inwt;
run;
%do i=1 %to &numiter; /* loop on iteration */
%let sumterm = 0; /* cumulative termin flags*/
%do j=1 %to &numvar;/* loop on raking variable */
%let varrake= %scan(&varlist,&j);
%if (&freqlist ne ) %then
%let dsfreq=%scan(&freqlist,&j);
%else
%let dsfreq=&varrake;
proc sort data=__i0;
by &varrake;
run;
proc summary nway data=__i0 ; *** calculate;
class &varrake; *** adjusted;
var weight; ** weighted total;
output out=__i1(drop=_type_ _freq_) sum=sum&j;
run;
data __i0; ** merge with marginal data set;
merge __i0(in=_1) __i1 &dsfreq(in=_2);
by &varrake;
%if &i=1 %then %do;
if (_1 and ^_2) or (_2 and ^_1) then do;
call symput('match','1');stop;end; else
call symput('match','2');
if mrgtotal ne . then call symput('mrg','1'); else call
symput('mrg','2');
if percent ne . then call symput ('pct','1'); else call
symput('pct','2');
%end;
run;
%if &i=1 %then %do;
%if &match=1 %then %do;
%put
**** Program terminated: levels of variable &varrake do
not match ****;
%put
****in sample and marginal totals data sets ****;
endsas;
%end;
%if &pct = 1 and (&cntotal eq ) %then %do;
%put ** Program terminated: PERCENT is not missing and
CNTOTAL is missing **;
%put ** for raking variable &varrake **;
endsas;
%end;
%else
%if &pct=2 and &mrg=2 %then %do;
%put **** Program terminated: Both PERCENT and MRGTOTAL
are missing ****;
endsas;
%end;
%end;
data __i0;
set __i0;
%if (&cntotal ne ) %then %do;
if mrgtotal ne . then /*case of marginal totals*/
cntmarg=mrgtotal;
else
if percent ne . then /*case of marginal freqs*/
cntmarg=&cntotal.*percent/100;
%end;
%else %do;
if mrgtotal ne . then /*case of marginal totals*/
cntmarg=mrgtotal;
%end;
weight=weight*cntmarg/sum&j;***actual raking;
drop percent mrgtotal;
run;
data __i2(keep=&varrake sum&j cntmarg differ);
set __i0;
by &varrake;
if first.&varrake;
differ=cntmarg-sum&j;
run;
proc print label data=__i2; **print diagnostics;
%if (&byvar ne) %then %do;
title3 "Raking &byvar - &s by &varrake, iteration - &i";
%end; %else %do;
title3 "Raking by &varrake, iteration - &i ";
%end;
sum sum&j cntmarg;
label sum&j ='Calculated margin'
differ='Difference'
cntmarg='Margin Control Total';
run;
data __i2;
set __i2 end=eof;
retain comm 0; *** termination test;
if abs( differ)>&trmprec then comm=1;
if eof and comm=1 then
call symput("term&j",'2'); ** continue with iterations;
else
if eof then
call symput("term&j",'1'); ** convergence ;
run; ** achieved;
%let sumterm=%eval(&sumterm+&&term&j);
%end;
data __i0;
set __i0;
drop sum1-sum&numvar;
%if (&byvar ne ) %then
%put &byvar=&s iteration=&i numvar=&numvar
sumterm=&sumterm;
%else
%put numvar=&numvar sumterm=&sumterm;
%if &sumterm=&numvar or &i=&numiter %then %do;
/** termination criterion met **/
%if (&byvar ne ) %then
%put **** Terminated &byvar &s at &i-th iteration;
%else
%put *** Task terminated at &i-th iteration ****;
title3 ' ';
data _null_; *** write diagostics to listing;
set __i0;
if _n_=1;
file print &ps &ls;
put ' ';
%if &sumterm=&numvar %then %do; ** convergence;
%if (&byvar ne ) %then %do; ** achieved;
put "**** Program for &byvar &s terminated at iteration &i
because all calculated margins";
%end; %else %do;
put "**** Program terminated at iteration &i because all calculated margins";
%end;
put "differ from Marginal Control Totals by less than &trmprec";
run;
%end;
%else %do; ** no convergence ;
%if (&byvar ne ) %then %do;
put "**** Program for &byvar &s terminated at iteration &i";
%end; %else %do;
put "**** Program terminated at iteration &i";
%end;
put "**** No convergence achieved";
%end;
data &outds(drop=cntmarg); *** write output;
set __i0; *** data set;
rename weight=&outwt;
%let i=&numiter;
%end;
%end;
proc datasets library=work nolist mt=data;
delete __i0 __i1 __i2; ** purge work data;
run;
quit;
%mend;


* execute the macro;
%RAKING(
inds=respondents, 
outds=respondents_raked, 
inwt=weight_base, 
freqlist=gender agecat minority supervisor,
outwt=weight_raked, 
varlist=gender agecat minority supervisor,
numvar=4,
cntotal=,
trmprec=1, 
numiter=50
);


****** Chapter 11: Imputation Methods;

*** Figure 11.1;
data respondents_toplot;
  set samp;
* pick a random sample of about 100 data points;
if respond and 200 le _N_ le 400;
if (los - 20) gt age then los=los-3*abs(rannor(4220)); 
run;
proc sgplot data=respondents_toplot noautolegend;
  reg y=los x=age / 
  markerattrs=(size=5 symbol=circlefilled color=black)
  lineattrs=(color=black pattern=5 thickness=3);
xaxis label='Age of Employee';
yaxis label='Length of Service';
run;
quit;


*** Figure 11.2;
proc sgplot data=respondents_toplot noautolegend;
  vbar gender / response=Q1 stat=mean fillattrs=(color=black);
xaxis label='Employee Gender';
yaxis label='Proportion Reacting Positively to Q1';
run;


*** Program 11.1;
proc mi data=samp out=samp_MI_5 nimpute=5 seed=943222;
  class gender minority;
  var gender minority age supervisor los;
  monotone reg (los=age supervisor gender minority / details);
run;


*** Program 11.2;
proc mi data=samp nimpute=5 seed=7000726 out=samp_MI_5;
  class gender minority Q1;
  var gender minority age supervisor Q1;
  monotone logistic (Q1=gender minority age supervisor / details);
run;


*** Program 11.3;
proc mi data=samp nimpute=5 seed=655231 out=samp_MI_5;
  class gender minority Q1_full;
  var gender minority age supervisor Q1_full;
  monotone logistic (Q1_full=gender minority age supervisor / details);
run;


*** Program 11.4;
proc mi data=samp nimpute=5 seed=609210 out=samp_MI_5;
  class gender minority Q1_full;
  var gender minority age supervisor Q1_full;
  monotone logistic (Q1_full=gender minority age supervisor /
                     link=glogit details);
run;


*** Program 11.5;
* get counts of observed and missing cases within cells;
proc sql;
  create table counts_obs as
  select gender,minority,count(*) as _NSIZE_
    from samp
    where Q1_full ne .
    group by gender,minority;

  create table counts_miss as
  select gender,minority,count(*) as _NSIZE_
    from samp
    where Q1_full = .
    group by gender,minority;
quit;

* select donors within cells;
proc sort data=samp;
  by gender minority;
run;

* first step of ABB is to select WR sample of observed cases;
proc surveyselect data=samp (where=(Q1_full ne .)) 
  sampsize=counts_obs method=URS seed=5441254 reps=5
  outhits out=ABB_part1 (keep=replicate gender minority Q1_full);
  strata gender minority;
run;

* second step is to select WR sample of missing cases;
* create supplemental sample size file with the M=5 replicate codes;
data counts_miss;
  set counts_miss;
do replicate=1 to 5;
  output;
end;
run;
proc sort data=counts_miss;
  by replicate gender minority;
run;
proc sort data=ABB_part1;
  by replicate gender minority;
run;
* select the sample of donors;
proc surveyselect data=ABB_part1 
  sampsize=counts_miss method=URS seed=8744063
  outhits out=ABB_part2 (keep=replicate gender minority Q1_full);
  strata replicate gender minority;
run;

* randomly sort donor values within a cell;
data ABB_part2;
  set ABB_part2;
random=ranuni(34234433);
run;
proc sort data=ABB_part2 out=ABB_part2 (drop=random);
  by replicate gender minority random;
run;

* merge donor values into SAMP to create M=5 completed data sets;
data samp_MI_5;
  set samp;
do replicate=1 to 5;
  output;
end;
run;

* sort stacked completed data sets such that missing values appear first;
proc sort data=samp_MI_5;
  by replicate gender minority Q1_full;
run;
data samp_MI_5;
  merge samp_MI_5
        ABB_part2 (in=b);
  by replicate gender minority;
run;


*** Program 11.6;
proc mi data=samp nimpute=5 seed=655231 out=samp_MI_5;
  class gender minority;
  var gender minority age supervisor Q1_full;
  monotone propensity (Q1_full=age|gender|supervisor|minority @2);
run;


*** Program 11.7;
proc mi data=samp nimpute=5 seed=2311109 out=samp_MI_5;
  class gender minority;
  var gender minority age supervisor LOS;
  monotone regpmm (LOS=age gender supervisor minority / k=15);
run;


*** Program 11.8;
proc mi data=samp nimpute=5 seed=2311002 out=samp_MI_5;
  class gender minority Q1;
  var age supervisor gender minority LOS Q1;
  monotone reg      (LOS / details); 
  monotone logistic (Q1  / details);
run;


*** Program 11.9;
data samp_arb;
  set samp;
if LOS ne . and ranuni(882377) < .25 then LOS=.;
if Q1  ne . and ranuni(243113) < .1  then Q1=.;
run;

proc mi data=samp_arb nimpute=5 seed=873772 out=samp_MI_5;
  class gender minority Q1;
  var age supervisor gender minority Q1 LOS;
  fcs reg      (LOS / details); 
  fcs logistic (Q1  / details);
run;


*** Program 11.10;
* replicating the predictive mean matching imputation model approach 
  from Program 11.7;
proc mi data=samp nimpute=5 seed=2311109 out=samp_MI_5;
  class gender minority;
  var gender minority age supervisor LOS;
  monotone regpmm (LOS=age gender supervisor minority / k=15);
run;

* compute and store the M estimated means and standard errors of LOS;
proc sort data=samp_MI_5;
  by _IMPUTATION_;
run;

ods output statistics=stats_MI_5;
proc surveymeans data=samp_MI_5 mean stderr;
  by _IMPUTATION_;
  var LOS;
weight weight_base;
run;

* get the overall MI estimate and standard error;
proc mianalyze data=stats_MI_5;
  modeleffects mean;
  stderr stderr;
run;


*** Program 11.11;
* replicating the logistic regression imputation model approach 
  from Program 11.2;
proc mi data=samp nimpute=5 seed=7000726 out=samp_MI_5;
  class gender minority Q1;
  var gender minority age supervisor Q1;
  monotone logistic (Q1=gender minority age supervisor / details);
run;

ods output ParameterEstimates=betas 
           covb=cov_matrix;
proc surveylogistic data=samp_MI_5;
  by _IMPUTATION_;
  class gender minority;
  model Q1(event='1') = gender minority age supervisor / covb;
run;

* rename the variables in cov_matrix to match those in betas;
data cov_matrix;
  set cov_matrix;
if Parameter='GENDERF' then Parameter='GENDER';
if Parameter='MinorityN' then Parameter='Minority';
rename GENDERF   = GENDER 
       MinorityN = Minority;
run;

proc mianalyze parms=betas covb=cov_matrix;
  modeleffects intercept gender minority age supervisor;
run;