The Car Evaluation dataset is a free dataset provided by hfh.
I am studying the book "Machine Learning practice" and now want to cluster this dataset with k-means. However, I found that the car dataset is a categorical dataset, while the k-means clustering algorithm can only cluster numerical data.
My idea is that, because the values in the car dataset are categorical but ordered by size, I can use pandas to map them to numbers:
# Ordinal codes for the categorical labels: a larger code stands for a
# "larger" category, so numeric distances keep the ordering of the labels.
size_mapping = {
    # general low-to-very-high scale
    "low": 1,
    "med": 2,
    "high": 3,
    "vhigh": 4,
    # open-ended count label ("5 or more")
    "5more": 5,
    # small/big scale; shares the codes 1 and 3 with "low" and "high"
    "small": 1,
    "big": 3,
}
However, it is not clear to me how to convert the whole original car dataset to numeric types and feed it into k-means. The k-means program below has been typed out following the book.
# encoding:utf-8
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet(filename):
    """Read a space-separated numeric text file into a list of float rows.

    Args:
        filename: Path of the text file to read. Every non-blank line must
            contain numbers separated by a single space.

    Returns:
        A list holding one list of floats per non-blank line, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the file even when a conversion fails.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no sample; float("") would raise.
                continue
            # Build a real list: on Python 3 map() is lazy, and a list of
            # map objects cannot be turned into a numeric matrix.
            dataMat.append([float(field) for field in stripped.split(" ")])
    return dataMat
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (numpy array or 1 x n numpy matrix).
        vecB: Second vector, same shape as vecA.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    # power() squares element-wise; `**` on a numpy matrix would be a matrix
    # power instead. `sum` and `sqrt` are the numpy versions brought in by
    # the file's `from numpy import *`.
    return sqrt(sum(power(vecA - vecB, 2)))
def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D numpy matrix (or array) with one sample per row.
        k: Number of centroids to generate.

    Returns:
        A k x n numpy matrix, n being the number of columns of dataSet.
        Each value of column j lies between the minimum and the maximum of
        column j of dataSet.
    """
    n = shape(dataSet)[1]
    # asmatrix is the function the old `mat` alias pointed to; the alias
    # itself no longer exists in NumPy 2.
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # .min()/.max() give plain scalars, avoiding float() on a 1x1 matrix.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # random.rand(k, 1) draws k values in [0, 1), scaled to the column range.
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with the k-means iteration.

    Points are assigned to their nearest centroid and every centroid is then
    moved to the mean of its points; this repeats until no point changes
    cluster. The centroids are printed after each assignment pass.

    Args:
        dataSet: 2-D numpy matrix with one sample per row.
        k: Number of clusters.
        distMeas: Callable (centroid_row, sample_row) -> distance.
        createCent: Callable (dataSet, k) -> k x n matrix of start centroids.

    Returns:
        A tuple (centroids, clusterAssment). centroids is the k x n matrix of
        final centroids. clusterAssment is an m x 2 matrix whose column 0
        holds the cluster index of each sample and column 1 the squared
        distance to that cluster's centroid.
    """
    m = shape(dataSet)[0]
    # asmatrix replaces the `mat` alias, which no longer exists in NumPy 2.
    clusterAssment = asmatrix(zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(":")
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # Update step: move each centroid to the mean of its samples.
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            if shape(ptsInClust)[0] == 0:
                # The mean of an empty cluster is NaN, which would make this
                # centroid unreachable forever; keep its previous position.
                continue
            centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
# Load the already-numeric, space-separated samples from "car.txt", cluster
# them into 4 groups and print the centroids and the per-sample assignment.
# asmatrix replaces the `mat` alias, which no longer exists in NumPy 2.
datMat = asmatrix(loadDataSet("car.txt"))
myCentroids, clustAssing = kMeans(datMat, 4)
# One pre-formatted argument gives the same output on Python 2 and Python 3
# (the label line, then the matrix on the following lines).
print(":\n%s" % (myCentroids,))
print(":\n%s" % (clustAssing,))