import csv,random,math,operator
import matplotlib.pyplot as plt
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a CSV file and randomly split its rows into two lists.

    Every non-empty row is parsed and then appended to ``trainingSet`` when
    ``random.random() < split`` and to ``testSet`` otherwise.  All columns
    except the last are converted to ``float``.  The last column is
    converted to ``float`` when possible and is otherwise kept as the
    original string, so textual class labels are accepted.

    Args:
        filename: Path of the CSV file to read.
        split: Threshold compared against ``random.random()``; a row goes to
            the training list when the random draw is below it.
        trainingSet: Optional list that receives the training rows in place.
            A fresh list is created when omitted.
        testSet: Optional list that receives the test rows in place.
            A fresh list is created when omitted.

    Returns:
        The tuple ``(trainingSet, testSet)``; these are the same list objects
        that were passed in, when they were passed in.

    Raises:
        ValueError: If a column other than the last one is not numeric.
    """
    # Fresh lists per call: list defaults in the signature would be shared
    # between calls and silently accumulate rows.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # Text mode with newline='' is what csv.reader expects on Python 3.
    with open(filename, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Blank lines (e.g. a trailing newline at the end of the file)
            # come back as empty rows; skip them instead of blindly
            # dropping whatever row happens to be last.
            if not row:
                continue
            record = [float(value) for value in row[:-1]]
            try:
                record.append(float(row[-1]))
            except ValueError:
                # Non-numeric last column: keep it as a string label.
                record.append(row[-1])
            if random.random() < split:
                trainingSet.append(record)
            else:
                testSet.append(record)
    return trainingSet, testSet
def getDistance(instance1, instance2, style):
    """Score two numeric sequences against each other.

    The sequences are paired element by element with ``zip``, so any extra
    trailing elements of the longer one are ignored.

    Args:
        instance1: First sequence of numbers.
        instance2: Second sequence of numbers.
        style: ``'euclidean'`` for the Euclidean distance, or ``'cos'`` for
            the cosine of the angle between the two vectors.  Note that the
            ``'cos'`` result is a similarity (larger means more alike), not
            a distance.

    Returns:
        A float for the two supported styles.  ``None`` when ``style`` is
        ``'cos'`` and either vector has zero norm, and ``None`` for any
        other ``style`` value.
    """
    pairs = zip(instance1, instance2)

    if style == 'euclidean':
        squaredSum = 0.0
        for left, right in pairs:
            squaredSum += (left - right) ** 2
        return math.sqrt(squaredSum)

    if style != 'cos':
        # Unrecognised style: nothing is computed and None is returned.
        return None

    dotProduct = 0.0
    squaredNorm1 = 0.0
    squaredNorm2 = 0.0
    for left, right in pairs:
        dotProduct += left * right
        squaredNorm1 += left ** 2
        squaredNorm2 += right ** 2

    # The cosine is undefined when either vector has zero length.
    if squaredNorm1 == 0.0 or squaredNorm2 == 0.0:
        return None
    return dotProduct / ((squaredNorm1 * squaredNorm2) ** 0.5)
def getNeighbors(trainingSet, testInstance, style, k):
    """Return up to ``k`` training rows nearest to ``testInstance``.

    Each training row is expected to carry its class label as the last
    element; the label is excluded from the comparison so it cannot
    influence which rows are considered near.

    Args:
        trainingSet: Sequence of rows (sliceable sequences whose last
            element is the class label).
        testInstance: Feature values of the instance to classify.  A
            trailing label, if present, is ignored.
        style: Metric name forwarded to ``getDistance`` (``'euclidean'`` or
            ``'cos'``).
        k: Maximum number of neighbours to return.

    Returns:
        A list of at most ``k`` rows from ``trainingSet``, nearest first.
        Fewer rows are returned when the training set has fewer than ``k``.
    """
    farthest = float('inf')
    ranked = []
    for row in trainingSet:
        # Strip the label from the training row.  getDistance pairs values
        # with zip(), which stops at the shorter sequence, so a label at the
        # end of testInstance is left out of the computation as well.
        score = getDistance(testInstance, row[:-1], style)
        if score is None:
            # Undefined score (e.g. zero-norm vector for 'cos'): treat the
            # row as the farthest possible instead of failing in the sort.
            rank = farthest
        elif style == 'cos':
            # 'cos' yields a similarity: larger means nearer, so negate it
            # to keep "smaller rank == nearer" for every style.
            rank = -score
        else:
            rank = score
        ranked.append((rank, row))

    # Sort on the rank only; list.sort is stable, so ties keep their
    # training-set order.
    ranked.sort(key=lambda pair: pair[0])
    return [row for _, row in ranked[:max(k, 0)]]
def getResponse(neighbors):
    """Return the majority class label among ``neighbors``.

    The label of each neighbour is its last element.  When several labels
    share the highest vote count, the one that appears first in
    ``neighbors`` wins.

    Args:
        neighbors: Non-empty sequence of rows whose last element is a
            hashable class label.

    Returns:
        The label with the most votes.

    Raises:
        IndexError: If ``neighbors`` is empty.
    """
    classVotes = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        classVotes[label] = classVotes.get(label, 0) + 1

    if not classVotes:
        # Same exception type the previous implementation raised when it
        # indexed into an empty list of votes.
        raise IndexError('cannot vote with an empty neighbor list')

    # dict preserves insertion order and max() returns the first maximal
    # key, which gives the first-seen label on ties.
    return max(classVotes, key=classVotes.get)
def getAccuracy(testSet, predictions):
    """Return the percentage of test rows whose label was predicted correctly.

    A prediction is correct when it compares equal (``==``) to the last
    element of the corresponding test row.

    Args:
        testSet: Sequence of rows whose last element is the true label.
        predictions: Sequence of predicted labels, aligned with ``testSet``.

    Returns:
        The accuracy as a float between 0.0 and 100.0.  An empty ``testSet``
        yields 0.0.

    Raises:
        IndexError: If ``predictions`` has fewer entries than ``testSet``.
    """
    if not testSet:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    if len(predictions) < len(testSet):
        raise IndexError('predictions has fewer entries than testSet')

    # Compare by value: identity ("is") is False for equal floats or
    # strings that merely live in different objects.
    correct = sum(
        1 for row, predicted in zip(testSet, predictions)
        if row[-1] == predicted
    )
    return (correct / float(len(testSet))) * 100.0
def main():
    """Run the k-nearest-neighbours demo on ``iris.data``.

    Loads the dataset into a random training/test split, predicts a label
    for every test row from its nearest training rows, prints each
    prediction next to the actual label, and finally prints the overall
    accuracy percentage.
    """
    # Run configuration.
    style = 'euclidean'
    k = 4
    split = 0.67

    # loadDataset fills these two lists in place.
    trainingSet, testSet = [], []
    loadDataset('iris.data', split, trainingSet, testSet)

    predictions = []
    for instance in testSet:
        ans = getResponse(getNeighbors(trainingSet, instance, style, k))
        predictions.append(ans)
        actual = instance[-1]
        print('> predicted=' + repr(ans) + '.actual=' + repr(actual))

    acc = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(acc) + '%')
# Run the demo.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this call
# also executes whenever the module is imported.
main()