dm 'log;clear;output;clear;';
options ps=50 ls=100 pageno=1;
goptions reset=global border ftext=swiss gunit=cm htext=0.4 htitle=0.5;
goptions display noprompt;

/**********************************************************************
 *  AUTHOR:  Chris Bilder
 *  COURSE:  STAT 873
 *  DATE:    11-30-03
 *  PURPOSE: Goblet data with K-means clustering
 *
 *  NOTES:   Flow of the program
 *             1. Read the goblet measurements and scale them by x3.
 *             2. Run a PCA on the ratios and plot the first PCs to get
 *                a first impression of the number of clusters.
 *             3. Standardize the ratios and run PROC FASTCLUS several
 *                times (different k, different starting seeds); after
 *                each run the clusters are displayed on the PC plots.
 **********************************************************************/

title1 'Chris Bilder, STAT 873';

/* Read the raw data. Every measurement is divided by x3 so that the
   analysis is done on ratios. The goblet number read from the file is
   replaced by the observation number. */
data set1;
    infile 'c:/chris/unl/stat873/chapter 5/goblet.txt';
    input goblet x1 x2 x3 x4 x5 x6;
    w1 = x1/x3;
    w2 = x2/x3;
    w4 = x4/x3;
    w5 = x5/x3;
    w6 = x6/x3;
    goblet = _n_;
run;

/*******************************************************************************
 * Initial PCA
 *******************************************************************************/
title2 'Initial PCA investigation';
proc princomp data=set1 out=scores;
    var w1 w2 w4 w5 w6;
run;

/* Note: the number of PCs needed is probably more than 3.
   The plots below could still be used to try to get an initial number
   of clusters. */

/* Scatter plot of the first two principal components */
proc gplot data=scores;
    plot prin2*prin1 / vaxis=axis1 haxis=axis2 frame grid
                       vref=0 href=0 cvref=green chref=green;
    title2 "Prin. Comp. #1 vs. Prin. Comp. #2";
    symbol1 v=dot h=0.25 cv=blue;
    axis1 label = (a=90 'Prin. Comp. #2') length = 12;
    axis2 label = ('Prin. Comp. #1') length = 12;
run;

/* 3D scatter plot of the first three principal components */
proc g3d data=scores;
    scatter prin2*prin1 = prin3 / grid zticknum=6 xticknum=6 yticknum=6
                                  shape='cube' color='blue'
                                  rotate=140 tilt=40;
    title2 "3D scatter plot";
run;

/*******************************************************************************
 * %PLOTS - show the current clustering on the principal component plots.
 *
 * Expects two data sets to exist when it is called:
 *   scores   - OUT= data set of PROC PRINCOMP (prin1-prin3, goblet)
 *   out_set1 - OUT= data set of PROC FASTCLUS (cluster, goblet)
 *
 * Comments inside the macro use the block form because statement-style
 * comments are tokenized by the macro processor.
 *******************************************************************************/
%MACRO PLOTS;

    /* Attach the cluster number to the PC scores. Matching is done
       explicitly on goblet rather than relying on both data sets being
       in the same row order. Variables present in both data sets
       (w1-w6) take their value from out_set1, i.e. the standardized
       version; they are not used by the plots. */
    data scores2;
        merge scores out_set1;
        by goblet;
    run;

    /* Clear only the SYMBOL and AXIS definitions left over from earlier
       plots. RESET=GLOBAL would also cancel the TITLE statements, which
       removes title1 from all later output. */
    goptions reset=symbol;
    goptions reset=axis;

    /* Scatter plot of the first two principal components, one plotting
       symbol per cluster */
    proc gplot data=scores2;
        plot prin2*prin1=cluster / vaxis=axis1 haxis=axis2 frame grid
                                   vref=0 href=0 cvref=green chref=green;
        title2 "Prin. Comp. #1 vs. Prin. Comp. #2";
        symbol1 v=dot    h=0.25 cv=blue;
        symbol2 v=square h=0.25 cv=red;
        symbol3 v=circle h=0.25 cv=green;
        symbol4 v=square h=0.25 cv=purple;
        symbol5 v=star   h=0.25 cv=black;
        axis1 label = (a=90 'Prin. Comp. #2') length = 12;
        axis2 label = ('Prin. Comp. #1') length = 12;
    run;

    /* Per-observation shape and color for PROC G3D. The LENGTH
       statements are required: without them the lengths would be taken
       from the first assignments ('balloon' = 7, 'blue' = 4) and
       'cylinder' / 'purple' would be truncated. */
    data scores3;
        set scores2;
        length shape $8;
        length color $6;
        select (cluster);
            when (1) do; shape = 'balloon';  color = 'blue';   end;
            when (2) do; shape = 'cube';     color = 'red';    end;
            when (3) do; shape = 'pyramid';  color = 'green';  end;
            when (4) do; shape = 'cylinder'; color = 'purple'; end;
            when (5) do; shape = 'star';     color = 'black';  end;
            otherwise do; shape = 'SPADE';   color = 'Brown';  end;
        end;
    run;

    /* 3D scatter plot of the first three principal components */
    proc g3d data=scores3;
        scatter prin2*prin1 = prin3 / grid zticknum=6 xticknum=6 yticknum=6
                                      shape=shape color=color
                                      rotate=100 tilt=40;
        title2 "3D scatter plot of PCs";
    run;

%MEND PLOTS;

/*******************************************************************************
 * %KMEANS - one K-means run on the standardized data followed by %PLOTS.
 *
 *   maxclusters = maximum number of clusters for PROC FASTCLUS
 *   seed        = value for RANDOM=; PROC FASTCLUS uses it only when
 *                 replace=random
 *   replace     = seed replacement method (REPLACE= option). The default,
 *                 full, is also the PROC FASTCLUS default, so the initial
 *                 seeds are then chosen deterministically from the data
 *                 and the value of seed has no influence on the result.
 *******************************************************************************/
%MACRO KMEANS(maxclusters=, seed=, replace=full);

    title2 'K-means clustering';
    proc fastclus data=stand_set1 maxclusters=&maxclusters replace=&replace
                  drift random=&seed maxiter=10
                  OUT=out_set1 OUTITER OUTSEED=temp;
        var w1 w2 w4 w5 w6;
        id goblet;
    run;

    %PLOTS;

%MEND KMEANS;

/*****************************************************************************
 * K-means clustering
 *****************************************************************************/

/* Need to standardize the data before FASTCLUS */
proc standard data=set1 out=stand_set1 mean=0 std=1;
    var w1 w2 w4 w5 w6;
run;

/* 5 clusters, default (deterministic) initial seeds */
%KMEANS(maxclusters=5, seed=2342901);

/* Try with 4 since it looks like two of the clusters could be joined */
%KMEANS(maxclusters=4, seed=2342901);

/* Try with 5 clusters and different random starting seeds.
   replace=random is needed here: without it RANDOM= is ignored and these
   two runs would simply repeat the first 5-cluster run. */
%KMEANS(maxclusters=5, seed=2342902, replace=random);

%KMEANS(maxclusters=5, seed=2342903, replace=random);

quit;