from System import Array
import numerics
from Extreme.Mathematics import *
from Extreme.Statistics import *
from Extreme.Statistics.Multivariate import *
# Demonstrates how to use classes that implement
# hierarchical and K-means clustering.
# This QuickStart Sample demonstrates how to run two
# common multivariate analysis techniques:
# hierarchical cluster analysis and K-means cluster analysis.
#
# The classes used in this sample reside in the
# Extreme.Statistics.Multivariate namespace.
# First, our dataset, which is from
# Computer-Aided Multivariate Analysis, 4th Edition
# by A. A. Afifi, V. Clark and S. May, chapter 16
# See http://www.ats.ucla.edu/stat/Stata/examples/cama4/default.htm
ror5 = NumericalVariable("ror5", Vector([ \
13.00, 13.00, 13.00, 12.20, 10.00, 9.80, 9.90, 10.30, \
9.50, 9.90, 7.90, 7.30, 7.80, 6.50, 9.20, 8.90, 8.40, \
9.00, 12.90, 15.20, 18.40, 9.90, 9.90, 10.20, 9.20]))
de = NumericalVariable("de", Vector([ \
.70, .70, .40, .20, .40, .50, .50, .30, \
.40, .40, .40, .60, .40, .40, 2.70, .90, \
1.20, 1.10, .30, .70, .20, 1.60, 1.10, .50, 1.00]))
salesgr5 = NumericalVariable("salesgr5", Vector([ \
20.20, 17.20, 14.50, 12.90, 13.60, 12.10, 10.20, 11.40, \
13.50, 12.10, 10.80, 15.40, 11.00, 18.70, 39.80, 27.80, \
38.70, 22.10, 16.00, 15.30, 15.00, 9.60, 17.90, 12.60, 11.60]))
eps5 = NumericalVariable("eps5", Vector([ \
15.50, 12.70, 15.10, 11.10, 8.00, 14.50, 7.00, 8.70, \
5.90, 4.20, 16.00, 4.90, 3.00, -3.10, 34.40, 23.50, \
24.60, 21.90, 16.20, 11.60, 11.60, 24.30, 15.30, 18.00, 4.50]))
npm1 = NumericalVariable("npm1", Vector([ \
7.20, 7.30, 7.90, 5.40, 6.70, 3.80, 4.80, 4.50, \
3.50, 4.60, 3.40, 5.10, 5.60, 1.30, 5.80, 6.70, \
4.90, 6.00, 5.70, 1.50, 1.60, 1.00, 1.60, .90, .80]))
pe = NumericalVariable("pe", Vector([ \
9.00, 8.00, 8.00, 9.00, 5.00, 6.00, 10.00, 9.00, \
11.00, 9.00, 7.00, 7.00, 7.00, 10.00, 21.00, 22.00, \
19.00, 19.00, 14.00, 8.00, 9.00, 6.00, 8.00, 6.00, 7.00]))
payoutr1 = NumericalVariable("payoutr1", Vector([ \
.4263980, .3806930, .4067800, .5681820, .3245440, .5108083, \
.3789130, .4819280, .5732480, .4907980, .4891300, .2722770, \
.3156460, .3840000, .3908790, .1612900, .3030300, .3033180, \
.2875000, .5989300, .5783130, .1949460, .3210700, .4537310, \
.5949660]))
variables = Array[NumericalVariable]([ ror5, de, salesgr5, eps5, npm1, pe, payoutr1 ])
collection = VariableCollection(variables)
#
# Hierarchical cluster analysis
#
print "Hierarchical clustering"
# Create the model:
hc = HierarchicalClusterAnalysis(variables)
# Rescale the variables to their Z-scores before doing the analysis:
hc.Standardize = True
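# (Standardizing replaces each value x by its Z-score, (x - mean) / standard deviation,
# so that variables measured on different scales contribute comparably to the distances.)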
# The linkage method defaults to Centroid:
hc.LinkageMethod = LinkageMethod.Centroid
# We could also set the distance measure. Here we set it explicitly to the default:
hc.DistanceMeasure = DistanceMeasures.SquaredEuclidianDistance
# Compute the model:
hc.Compute()
# We can partition the cases into clusters:
partition = hc.GetClusterPartition(5)
# Individual clusters are accessed through an index, or through enumeration.
for cluster in partition:
    print "Cluster {0} has {1} members.".format(cluster.Index, cluster.Size)
# And get a filter for the observations in a single cluster:
collection.Filter = partition[3].MemberFilter
print "Number of items in filtered collection:", collection.Observations.Count
collection.Filter = None
# Get a variable that shows memberships:
memberships = partition.GetMemberships()
for i in range(15, memberships.Length):
    print "Observation {0} belongs to cluster {1}.".format(i, memberships.GetLevelIndex(i))
# A dendrogram is a graphical representation of the clustering in the form of a tree.
# You can get all the information you need to draw a dendrogram starting from
# the root node of the dendrogram:
root = hc.DendrogramRoot
# Position and DistanceMeasure give the x and y coordinates:
print "Root position: ({0:.4f}, {1:.4f})".format(root.Position, root.DistanceMeasure)
# The left and right children:
print "Position of left child: {0:.4f}".format(root.LeftChild.Position)
print "Position of right child: {0:.4f}".format(root.RightChild.Position)
# You can also get a filter that defines a sort order suitable for
# drawing the dendrogram:
sortOrder = hc.GetDendrogramOrder()
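# As an illustration, the sketch below walks the dendrogram recursively and
# prints each join. It uses only the node properties shown above (LeftChild,
# RightChild, Position, DistanceMeasure) and assumes that a leaf node reports
# no children (LeftChild is None); verify that test against the actual API.
def printNode(node, indent):
    if node.LeftChild is None:
        # Leaf: a single observation at this x position.
        print "{0}Leaf at position {1:.2f}".format(indent, node.Position)
    else:
        # Internal node: a join at height DistanceMeasure.
        print "{0}Join at height {1:.4f}".format(indent, node.DistanceMeasure)
        printNode(node.LeftChild, indent + "  ")
        printNode(node.RightChild, indent + "  ")
printNode(root, "")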
print
#
# K-Means Clustering
#
print "K-means clustering"
# Create the model:
kmc = KMeansClusterAnalysis(variables, 3)
# Rescale the variables to their Z-scores before doing the analysis:
kmc.Standardize = True
# Compute the model:
kmc.Compute()
# We can partition the cases into clusters:
clusters = kmc.GetClusters()
# Individual clusters are accessed through an index, or through enumeration.
for cluster in clusters:
    print "Cluster {0} has {1} members. Sum of squares: {2:.4f}".format(cluster.Index, cluster.Size, cluster.SumOfSquares)
    print "Center:", cluster.Center.ToString("F4")
# The distances between clusters are also available:
print kmc.GetClusterDistances().ToString("F4")
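# The same information can also be listed pair by pair. This is a sketch that
# assumes the returned matrix supports two-dimensional indexing:
distanceMatrix = kmc.GetClusterDistances()
for i in range(3):
    for j in range(i + 1, 3):
        print "Distance between clusters {0} and {1}: {2:.4f}".format(i, j, distanceMatrix[i, j])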
# You can get a filter for the observations in a single cluster:
collection.Filter = clusters[1].MemberFilter
print "Number of items in filtered collection:", collection.Observations.Count
collection.Filter = None
# Get a variable that shows memberships:
memberships = clusters.GetMemberships()
# And one that shows the distances to the centers:
distances = clusters.GetDistancesToCenters()
for i in range(18, memberships.Length):
    print "Observation {0} belongs to cluster {1}, distance: {2:.4f}.".format(i, memberships.GetLevelIndex(i), distances[i])
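# Finally, as a quick comparison of the two solutions, we can cross-tabulate
# the hierarchical and K-means memberships. This sketch relies only on the
# GetLevelIndex method used above:
hcMemberships = partition.GetMemberships()
counts = {}
for i in range(memberships.Length):
    key = (hcMemberships.GetLevelIndex(i), memberships.GetLevelIndex(i))
    counts[key] = counts.get(key, 0) + 1
for key in sorted(counts):
    print "Hierarchical cluster {0} / K-means cluster {1}: {2} observations".format(key[0], key[1], counts[key])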