机器学习入门计划

  Hello! Machine Learning!

为什么要入门机器学习

  很简单,机器学习是目前时代的一个大潮流,大方向。在数据量和计算能力大幅提升之后,深层神经网络具有了工业应用的可能性,那么我们不禁把目光投向其他经典机器学习算法,如SVM,Logistic,Random Forest,隐马尔科夫链等会不会也能进行模型上的革新使得能够用于更大数据的计算;接着我们可以反思BP误差反向传播算法是否符合生物学本质,因为机器学习算法的本质我认为仍是基于数理逻辑使得机器仿真人类的行为,利用规则的相似性完成这次碳基到硅基的跃迁,随着生物技术的进步,如今流行的权值更新算法,神经元的传递过程是否会存在不合理之处,我们应该如何进行改进;再考虑到计算能力,既然GPU时代已经到来,那么在此之后会不会出现专用于机器学习的处理器?这些都是我们这个时代能够探索的内容,未来20年是我们的。

机器学习入门书单

  参考各个网站上给出的书单,我自己也整理了一份。
  理论知识部分:《机器学习》(西瓜书)等。

机器学习经典算法Python实现

Logistic回归

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def sigmoid(inX):
    """Logistic sigmoid 1 / (1 + e^-x).

    Accepts a scalar or a numpy array/matrix and applies element-wise.
    Fix: the original called bare `exp`, which is a NameError — neither
    `math.exp` nor `from numpy import exp` is in scope; use `np.exp`.
    """
    return 1.0 / (1 + np.exp(-inX))
def trainLogistic(train_X, train_Y, opts):
    """Train logistic-regression weights by (stochastic) gradient ascent.

    Args:
        train_X: numpy matrix of shape (numSamples, numFeatures); row-sample
                 layout is assumed — presumably an np.mat so `*` is matrix
                 multiply, TODO confirm against the caller.
        train_Y: numpy matrix of labels, shape (numSamples, 1), values 0/1.
        opts: dict with keys 'alpha' (learning rate), 'maxIter' (iterations)
              and 'optimizeType' in {'gradDescent', 'stocGradDescent',
              'smoothStocGradDescent'}.

    Returns:
        The learned weight column vector, shape (numFeatures, 1).

    Raises:
        NameError: if opts['optimizeType'] is not one of the supported modes.

    Fixes vs. original: parameters were spelled `tarin_*` while two branches
    referenced `train_*` (NameError); `random` was never imported in this
    listing (use the already-imported np.random); `del` on a range object
    fails on Python 3 (wrap in list); the inner stochastic loop shadowed the
    outer iteration counter `i`; the smooth variant indexed `train_X[randIndex]`
    instead of the surviving sample `dataIndex[randIndex]`, so deleted indices
    could be revisited.
    """
    numSamples, numFeatures = np.shape(train_X)
    alpha = opts['alpha']
    maxIter = opts['maxIter']
    weights = np.ones((numFeatures, 1))  # initial weight column vector, all ones

    for i in range(maxIter):
        if opts['optimizeType'] == 'gradDescent':
            # Full-batch update: one gradient step over all samples per iteration.
            output = sigmoid(train_X * weights)
            error = train_Y - output
            weights = weights + alpha * train_X.transpose() * error
        elif opts['optimizeType'] == 'stocGradDescent':
            # Plain stochastic: update after every sample, in fixed order.
            for j in range(numSamples):
                output = sigmoid(train_X[j, :] * weights)
                error = train_Y[j, 0] - output
                weights = weights + alpha * train_X[j, :].transpose() * error
        elif opts['optimizeType'] == 'smoothStocGradDescent':
            # Randomly draw samples without replacement within each pass and
            # decay alpha to damp cycle fluctuations.
            dataIndex = list(range(numSamples))  # list: range() does not support del
            for j in range(numSamples):
                alpha = 4.0 / (1.0 + j + i) + 0.01  # decays with both loops, never reaches 0
                randIndex = int(np.random.uniform(0, len(dataIndex)))
                sampleIdx = dataIndex[randIndex]  # map into not-yet-used samples
                output = sigmoid(train_X[sampleIdx, :] * weights)
                error = train_Y[sampleIdx, 0] - output
                weights = weights + alpha * train_X[sampleIdx, :].transpose() * error
                del dataIndex[randIndex]  # this sample is done for this pass
        else:
            raise NameError('Not support optimize method type!')
    return weights
def testLogRegres(weights, test_X, test_Y):
    """Return classification accuracy of `weights` on a labelled test set.

    Args:
        weights: learned weight column vector, shape (numFeatures, 1).
        test_X: numpy matrix (numSamples, numFeatures) — presumably np.mat
                so `*` is matrix multiply, TODO confirm with trainLogistic.
        test_Y: numpy matrix of 0/1 labels, shape (numSamples, 1).

    Returns:
        Fraction of samples (0.0–1.0) where thresholding sigmoid at 0.5
        matches the true label.

    Fixes vs. original: bare `shape` was a NameError (only `np` is
    imported); Python 2 `xrange` replaced with `range`.
    """
    numSamples, _numFeatures = np.shape(test_X)
    matchCount = 0
    for i in range(numSamples):
        # Predicted class: positive iff sigmoid output exceeds 0.5.
        predict = sigmoid(test_X[i, :] * weights)[0, 0] > 0.5
        if predict == bool(test_Y[i, 0]):
            matchCount += 1
    return float(matchCount) / numSamples
def showLogRegres(weights, train_X, train_Y):
    """Scatter-plot the training samples coloured by class.

    Plots feature columns 1 and 2 (column 0 is presumably the bias term —
    TODO confirm): class-0 points as red circles, class-1 as blue circles.

    NOTE(review): `weights` is accepted but never used — the decision
    boundary is not drawn despite the function's name; kept as-is to
    preserve behaviour.

    Fixes vs. original: bare `shape` was a NameError; Python 2 `xrange`;
    `int(train_Y[i,0]==0)` converted the *comparison* to int — it happened
    to work for 0/1 labels but `int(label) == 0` is what was meant.
    """
    numSamples, _numFeatures = np.shape(train_X)
    for i in range(numSamples):
        if int(train_Y[i, 0]) == 0:
            plt.plot(train_X[i, 1], train_X[i, 2], 'or')
        elif int(train_Y[i, 0]) == 1:
            plt.plot(train_X[i, 1], train_X[i, 2], 'ob')
    plt.show()

KNN分类算法

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import csv,random,math,operator
import matplotlib.pyplot as plt
def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a CSV file and randomly split its rows into train/test lists.

    Args:
        filename: path to a CSV where every field is numeric-parseable.
        split: probability (0.0–1.0) that a row goes to trainingSet.
        trainingSet/testSet: output lists appended to in place; pass your
            own lists to receive the result (default None creates fresh
            lists — fixes the original mutable-default-argument bug that
            accumulated rows across calls).

    Fixes vs. original: mutable default arguments; file opened in 'rb',
    which breaks on Python 3 because csv.reader requires text — use 'r'
    with newline='' per the csv module docs.
    """
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    with open(filename, 'r', newline='') as csvfile:
        dataSet = list(csv.reader(csvfile))
    # NOTE(review): the original iterates len-1 rows, skipping the final
    # one — presumably to drop a trailing blank line in iris.data; this
    # behaviour is preserved.
    for i in range(len(dataSet) - 1):
        for j in range(len(dataSet[i])):
            dataSet[i][j] = float(dataSet[i][j])
        if random.random() < split:
            trainingSet.append(dataSet[i])
        else:
            testSet.append(dataSet[i])
def getDistance(instance1, instance2, style):
    """Distance or similarity between two equal-length feature vectors.

    style='euclidean' returns the Euclidean distance; style='cos' returns
    the cosine similarity, or None when either vector has zero norm.
    Any other style falls through and returns None.
    """
    pairs = list(zip(instance1, instance2))
    if style == 'euclidean':
        return math.sqrt(sum((a - b) ** 2 for a, b in pairs))
    if style == 'cos':
        dot = sum(a * b for a, b in pairs)
        normA = sum(a * a for a, _ in pairs)
        normB = sum(b * b for _, b in pairs)
        if normA == 0.0 or normB == 0.0:
            return None
        return dot / ((normA * normB) ** 0.5)
def getNeighbors(trainingSet, testInstance, style, k):
    """Return the k training samples nearest to testInstance.

    Scores every training row with getDistance (using the given metric
    name), sorts ascending by that score, and returns the first k rows.
    NOTE: for the 'cos' metric larger means more similar, yet the sort is
    still ascending — preserved exactly as the original behaves.
    """
    scored = [(sample, getDistance(testInstance, sample, style))
              for sample in trainingSet]
    scored.sort(key=lambda pair: pair[1])
    return [sample for sample, _ in scored[:k]]
def getResponse(neighbors):
    """Majority-vote the class label over a list of neighbor rows.

    Args:
        neighbors: list of sample rows where the last element of each row
                   is the class label.

    Returns:
        The label with the most votes; ties break toward the label first
        reaching the top after the stable sort.

    Fix vs. original: dict.iteritems() was removed in Python 3 — items()
    is the equivalent.
    """
    classVotes = {}
    for i in range(len(neighbors)):
        response = neighbors[i][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1),
                         reverse=True)
    return sortedVotes[0][0]
def getAccuracy(testSet, predictions):
    """Percentage of test rows whose true label matches the prediction.

    Args:
        testSet: list of rows whose last element is the true class label.
        predictions: list of predicted labels, parallel to testSet.

    Returns:
        Accuracy as a percentage in [0.0, 100.0]; 0.0 for an empty set
        (the original raised ZeroDivisionError).

    Fix vs. original: compared labels with `is` (object identity), which
    fails for equal-but-distinct objects such as parsed floats; value
    equality `==` is what is meant.
    """
    if not testSet:
        return 0.0
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(testSet))) * 100.0
def main():
    """Run the full KNN pipeline on the local iris.data CSV file.

    Loads and splits the data, classifies every test row by Euclidean
    k-nearest-neighbors majority vote, prints each prediction against the
    true label, and reports overall accuracy.
    """
    trainingSet = []
    testSet = []
    split = 0.67  # fraction of rows routed to the training set
    k = 4
    style = 'euclidean'
    loadDataset('iris.data', split, trainingSet, testSet)
    predictions = []
    for i in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[i], style, k)
        ans = getResponse(neighbors)
        predictions.append(ans)
        print('> predicted=' + repr(ans) + '.actual=' + repr(testSet[i][-1]))
    acc = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(acc) + '%')


# Fix vs. original: guard the entry point so importing this module does not
# immediately run the pipeline (the original called main() unconditionally).
if __name__ == '__main__':
    main()
在月色与雪色之间,你是第三种绝色。