dm 'log;clear;output;clear;';
options ps=50 ls=100 pageno=1;
goptions reset=global border ftext=swiss gunit=cm htext=0.4 htitle=0.5;
goptions display noprompt;

**********************************************************************;
**                                                                  **;
** AUTHOR:  Chris Bilder                                            **;
** COURSE:  STAT 873                                                **;
** DATE:    3-13-01                                                 **;
** UPDATE:  8-24-03, 6-26-04                                        **;
** PURPOSE: Use Cluster analysis on the two variable data set       **;
**                                                                  **;
** NOTES:                                                           **;
**                                                                  **;
**********************************************************************;

title1 'Chris Bilder, STAT 873';

/* ------------------------------------------------------------------ */
/* Read the 14 (X1, X2) pairs; obs_numb records the input order and   */
/* is used later as the ID variable and as the merge key.             */
/* ------------------------------------------------------------------ */
data set1;
  input X1 X2;
  obs_numb = _n_;
  datalines;
2 3
3 3
4 2
5 6
7 5
4 10
4 12
5 11
17 15
18 16
18 19
19 17
19 20
20 20
;
run;

title2 'The two variable data set';
proc print data=set1;
run;

/* Scatter plot of the raw data, each point labelled with obs_numb */
proc gplot data=set1;
  plot X2*X1 / vaxis=axis1 haxis=axis2 frame grid;
  title2 "X1 vs. X2";
  symbol1 v=dot h=0.4 pointlabel=('#obs_numb') cv=blue;
  axis1 label = (a=90 'X2') length = 10 order = (0 to 20 by 5);
  axis2 label = ('X1') length = 10 order = (0 to 20 by 5);
run;

/* Standardize X1 and X2 to mean 0 and standard deviation 1.          */
/* stand_set is plotted below and read again by PROC IML.             */
proc standard data=set1 out=stand_set mean=0 std=1;
  var X1 X2;
run;

/* Scatter plot of the standardized data */
proc gplot data=stand_set;
  plot X2*X1 / vaxis=axis1 haxis=axis2 frame grid;
  title2 "Standardized X1 vs. standardized X2";
  symbol1 v=dot h=0.4 pointlabel=('#obs_numb') cv=blue;
  axis1 label = (a=90 'Standardized X2') length = 12.5 order = (-1.5 to 1.5 by 1);
  axis2 label = ('Standardized X1') length = 12.5 order = (-1.5 to 1.5 by 1);
run;

/* ------------------------------------------------------------------ */
/* Single linkage cluster analysis                                    */
/*   NONORM   prevents the distances from being normalized to unit    */
/*            mean - see SAS HELP                                     */
/*   S        gives summary statistics                                */
/*   STANDARD makes PROC CLUSTER standardize the data prior to the    */
/*            analysis                                                */
/*   METHOD=SINGLE specifies the single linkage (nearest neighbor)    */
/*            method                                                  */
/*   OUTTREE= names the data set that receives the information        */
/*            needed for a tree diagram                               */
/* ------------------------------------------------------------------ */
title2 'Cluster analysis using single linkage';
proc cluster data=set1 method=single s standard nonorm outtree=tree1;
  var X1 X2;
  id obs_numb;
run;

title2 'The tree data set';
proc print data=tree1;
run;

/* NCLUSTERS= specifies the number of clusters to use in the OUT= data set */
title2 'PROC TREE output';
proc tree data=tree1 horizontal lines=(color=blue) out=treeout1 nclusters=5;
  copy X1 X2;
  id obs_numb;
run;

title2 'The treeout1 data set';
proc print data=treeout1;
run;

/* ------------------------------------------------------------------ */
/* Verify the distance calculations from PROC CLUSTER                 */
/* ------------------------------------------------------------------ */
proc iml;
  use stand_set;
  read all var{X1 X2} into X;
  print X;

  /* Number of observations; drives the size of D and the loop bounds */
  n = nrow(X);

  /* Euclidean distances between rows of the standardized data.       */
  /* X[i,] is row i of X (the standardized X1 and X2 of observation   */
  /* i). sqrt(ssq(a - b)) equals sqrt((a - b)*t(a - b)) for row       */
  /* vectors a and b.                                                 */
  /* These two pairs are specific to the 14-observation data set.     */
  d_1_2   = sqrt(ssq(X[1,]  - X[2,]));
  d_13_14 = sqrt(ssq(X[13,] - X[14,]));
  print 'Smallest distances' d_1_2 d_13_14;

  /* One row per unordered pair (n choose 2 rows):                    */
  /*   column 1 = i, column 2 = j, column 3 = distance between i, j   */
  D = repeat(0, n*(n-1)/2, 3);

  /* k indexes the next row of D to fill */
  k = 1;
  do i = 1 to n-1;
    do j = i+1 to n;
      D[k,1] = i;
      D[k,2] = j;
      D[k,3] = sqrt(ssq(X[i,] - X[j,]));
      k = k + 1;
    end;
  end;

  print 'All distances' D;
quit;

/* Standardize the data again into stand1 for the cluster plot below */
proc standard data=set1 out=stand1 mean=0 std=1;
  var X1 X2;
run;

/* Keep the standardized values under new names so that they can be   */
/* merged with treeout1, which already carries variables X1 and X2.   */
data stand2;
  set stand1;
  X1_stand = X1;
  X2_stand = X2;
  keep X1_stand X2_stand obs_numb;
run;

proc sort data=treeout1;
  by obs_numb;
run;

data treeout2;
  merge treeout1 stand2;
  by obs_numb;
run;

/* Scatter plot of the standardized data, one symbol per cluster */
proc gplot data=treeout2;
  plot X2_stand*X1_stand=cluster / vaxis=axis1 haxis=axis2 frame grid;
  title2 "X2 vs. X1 (standardized) with clusters";
  symbol1 v=dot      h=0.6 cv=blue   pointlabel=none;
  symbol2 v=square   h=0.6 cv=red    pointlabel=none;
  symbol3 v=triangle h=0.6 cv=green  pointlabel=none;
  symbol4 v=diamond  h=0.6 cv=purple pointlabel=none;
  symbol5 v=circle   h=0.6 cv=black  pointlabel=none;
  axis1 label = (a=90 'X2') length = 10 order = (-1.5 to 1.5 by 0.5);
  axis2 label = ('X1') length = 10 order = (-1.5 to 1.5 by 0.5);
run;

/* ------------------------------------------------------------------ */
/* %DIFFERENT: run the cluster analysis, tree, listing and cluster    */
/* plot for one clustering method.                                    */
/*                                                                    */
/* Parameters:                                                        */
/*   method     (positional) value passed to METHOD= of PROC CLUSTER  */
/*   nclusters= (keyword, default 5) value passed to NCLUSTERS= of    */
/*              PROC TREE                                             */
/*                                                                    */
/* Reads WORK.set1. Each call overwrites tree1, treeout1, stand1,     */
/* stand2 and treeout2.                                               */
/* Only SYMBOL1-SYMBOL5 are defined below.                            */
/* NOTE(review): confirm the plot appearance before using a value of  */
/* nclusters above 5.                                                 */
/* ------------------------------------------------------------------ */
%MACRO DIFFERENT(method, nclusters=5);

  title2 "PROC CLUSTER output for &method";
  proc cluster data=set1 method=&method s standard outtree=tree1 pseudo CCC;
    var X1 X2;
    id obs_numb;
  run;

  title2 "PROC TREE output for &method";
  proc tree data=tree1 horizontal lines=(color=blue) out=treeout1 nclusters=&nclusters;
    copy X1 X2;
    id obs_numb;
  run;

  /* Standardize the data for plotting against the cluster assignment */
  proc standard data=set1 out=stand1 mean=0 std=1;
    var X1 X2;
  run;

  /* Rename the standardized values so the merge does not collide     */
  /* with X1 and X2 in treeout1                                       */
  data stand2;
    set stand1;
    X1_stand = X1;
    X2_stand = X2;
    keep X1_stand X2_stand obs_numb;
  run;

  proc sort data=treeout1;
    by obs_numb;
  run;

  data treeout2;
    merge treeout1 stand2;
    by obs_numb;
  run;

  title2 "The clusters for &method";
  proc print data=treeout2;
  run;

  /* Scatter plot of the standardized data, one symbol per cluster */
  proc gplot data=treeout2;
    plot X2_stand*X1_stand=cluster / vaxis=axis1 haxis=axis2 frame grid;
    title2 "X2 vs. X1 (standardized) - Clusters developed using &method";
    symbol1 v=dot      h=0.6 cv=blue   pointlabel=none;
    symbol2 v=square   h=0.6 cv=red    pointlabel=none;
    symbol3 v=triangle h=0.6 cv=green  pointlabel=none;
    symbol4 v=diamond  h=0.6 cv=purple pointlabel=none;
    symbol5 v=circle   h=0.6 cv=black  pointlabel=none;
    axis1 label = (a=90 'X2') length = 10 order = (-1.5 to 1.5 by 0.5);
    axis2 label = ('X1') length = 10 order = (-1.5 to 1.5 by 0.5);
  run;

%MEND DIFFERENT;

/* Examine the results for many different methods */
%DIFFERENT(SINGLE);
%DIFFERENT(AVERAGE);
%DIFFERENT(CENTROID);
%DIFFERENT(COMPLETE);
%DIFFERENT(WARD);

quit;