Assignment Overview

Requirements

Given a person's profile attributes, predict whether their annual income exceeds US$50,000. This is a classic binary classification problem, solved here with both logistic regression and a generative model.

Dataset Description

X_train, Y_train, and X_test have already been preprocessed and can be used directly; the other two files, train.csv and test.csv, are provided only as additional reference information.

References

https://www.cnblogs.com/HL-space/p/10785225.html
https://mrsuncodes.github.io/2020/03/19/%E6%9D%8E%E5%AE%8F%E6%AF%85%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0-%E7%AC%AC%E4%BA%8C%E8%AF%BE%E4%BD%9C%E4%B8%9A/

Source code provided by Prof. Hung-yi Lee (李宏毅)

Data Preprocessing

Normalize every feature; the training data is then further split into a training set and a development set.

https://blog.csdn.net/pipisorry/article/details/52247379
This article explains data normalization in detail and is worth a read; it covers the several normalization methods that come up here.
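
As a quick illustration of the z-score normalization used in this assignment (a minimal sketch with made-up toy values, not part of the assignment code):

import numpy as np

# Toy data: 3 samples, 2 features (hypothetical values for illustration only)
X = np.array([[1.0, 200.0],
              [2.0, 400.0],
              [3.0, 600.0]])

mean = X.mean(axis = 0, keepdims = True)   # per-column mean
std = X.std(axis = 0, keepdims = True)     # per-column standard deviation
X_norm = (X - mean) / (std + 1e-8)         # the small epsilon avoids division by zero

print(X_norm)   # every column now has mean 0 and standard deviation (almost) 1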

import numpy as np

# Read the data; adjust the paths to match your own file locations
np.random.seed(0)
X_train_fpath = './X_train2'
Y_train_fpath = './Y_train2'
X_test_fpath = './X_test2'
output_fpath = './output_{}.csv'

# Parse the csv files into numpy arrays
# The source files are comma-separated and every field is filled (no missing values)
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype = float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)

def _normalize(X, train = True, specified_column = None, X_mean = None, X_std = None):
    """
    Normalize specific columns of X.
    The mean and standard deviation of the training data are reused when
    processing the testing data.
    Input:
        X: the dataset to be processed
        train: True when handling training data, False for testing data
        specified_column: the columns to normalize; if None, all columns are normalized
        X_mean: mean of the training data
        X_std: standard deviation of the training data
    Output:
        X, X_mean, X_std
    """

    if specified_column is None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

    return X, X_mean, X_std

def _train_dev_split(X, Y, dev_ratio = 0.25):
    # Split the data into a training set and a development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

# Normalize the data
X_train, X_mean, X_std = _normalize(X_train, train = True)
X_test, _, _ = _normalize(X_test, train = False, specified_column = None, X_mean = X_mean, X_std = X_std)

# Split the data into a training set and a development set
dev_ratio = 0.1
X_train, Y_train, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio = dev_ratio)

train_size = X_train.shape[0]
dev_size = X_dev.shape[0]
test_size = X_test.shape[0]
data_dim = X_train.shape[1]
print('Size of training set: {}'.format(train_size))
print('Size of development set: {}'.format(dev_size))
print('Size of testing set: {}'.format(test_size))
print('Dimension of data: {}'.format(data_dim))
Size of training set: 48830
Size of development set: 5426
Size of testing set: 27622
Dimension of data: 510

Helper Functions

These functions will be reused repeatedly later on.

def _shuffle(X, Y):
    """
    Shuffle the rows of X and Y in unison; X and Y must have the same length
    (equal first dimension).
    """
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def _sigmoid(z):
    """
    Sigmoid function, used to compute probabilities.
    The output is clipped to avoid overflow.
    """
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))

def _f(X, w, b):
    """
    The logistic regression function f(x) = sigmoid(w·x + b)
    Input:
        X: dataset; w: weights; b: bias
    """
    # matmul performs matrix multiplication
    return _sigmoid(np.matmul(X, w) + b)

def _predict(X, w, b):
    """
    Return a hard prediction for each row of X by thresholding the output
    of the logistic regression function.
    """
    # round rounds f to the nearest integer, astype(int) casts it, so the result is 0 or 1
    return np.round(_f(X, w, b)).astype(int)

def _accuracy(Y_pred, Y_label):
    """
    Compute the prediction accuracy, i.e. the fraction of correct predictions.
    """
    acc = 1 - np.mean(np.abs(Y_pred - Y_label))
    return acc
def _cross_entropy_loss(y_pred, Y_label):
    """
    Compute the cross-entropy loss.
    Input:
        y_pred: predicted probabilities, a float vector
        Y_label: ground-truth labels, a 0/1 vector

    Output:
        cross entropy
    """
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy

def _gradient(X, Y_label, w, b):
    """
    Compute the gradient of the cross-entropy loss with respect to w and b.
    """
    y_pred = _f(X, w, b)
    pred_error = Y_label - y_pred
    w_grad = -np.sum(pred_error * X.T, 1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad
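
For reference, _cross_entropy_loss and _gradient implement the standard binary cross-entropy and its derivatives. Written out in LaTeX, with f_{w,b}(x) = sigma(w·x + b) and n indexing the samples:

L(w, b) = -\sum_{n} \Big[ y^{(n)} \ln f_{w,b}(x^{(n)}) + \big(1 - y^{(n)}\big) \ln\big(1 - f_{w,b}(x^{(n)})\big) \Big]

\frac{\partial L}{\partial w} = -\sum_{n} \big( y^{(n)} - f_{w,b}(x^{(n)}) \big)\, x^{(n)}, \qquad
\frac{\partial L}{\partial b} = -\sum_{n} \big( y^{(n)} - f_{w,b}(x^{(n)}) \big)

The minus signs match the code: pred_error = Y_label - y_pred, and both gradients negate its sum.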

Training the Model

# Initialize w and b to zero
w = np.zeros((data_dim,))
b = np.zeros((1,))

# Training hyper-parameters
# These can be tuned: number of epochs, mini-batch size, and learning rate
max_iter = 20
batch_size = 15
learning_rate = 0.1

# Keep the loss and accuracy of every epoch so they can be plotted later
# (training loss, development loss, training accuracy, development accuracy)
train_loss = []
dev_loss = []
train_acc = []
dev_acc = []

# Count the number of parameter updates
step = 1

# Iterative training
for epoch in range(max_iter):
    # Shuffle the data at the start of each epoch
    X_train, Y_train = _shuffle(X_train, Y_train)

    # Mini-batch training
    for idx in range(int(np.floor(train_size / batch_size))):
        X = X_train[idx*batch_size:(idx+1)*batch_size]
        Y = Y_train[idx*batch_size:(idx+1)*batch_size]

        # Compute the gradient
        w_grad, b_grad = _gradient(X, Y, w, b)

        # Gradient update with a decaying learning rate: a simple schedule that
        # divides the learning rate by the square root of the update count
        w = w - learning_rate/np.sqrt(step) * w_grad
        b = b - learning_rate/np.sqrt(step) * b_grad

        step = step + 1

    # Compute loss and accuracy on the training set and the development set
    y_train_pred = _f(X_train, w, b)        # probabilities (float)
    Y_train_pred = np.round(y_train_pred)   # rounded to 0/1
    train_acc.append(_accuracy(Y_train_pred, Y_train))
    train_loss.append(_cross_entropy_loss(y_train_pred, Y_train) / train_size)

    y_dev_pred = _f(X_dev, w, b)
    Y_dev_pred = np.round(y_dev_pred)
    dev_acc.append(_accuracy(Y_dev_pred, Y_dev))
    dev_loss.append(_cross_entropy_loss(y_dev_pred, Y_dev) / dev_size)

print('Training loss: {}'.format(train_loss[-1]))
print('Development loss: {}'.format(dev_loss[-1]))
print('Training accuracy: {}'.format(train_acc[-1]))
print('Development accuracy: {}'.format(dev_acc[-1]))
Training loss: 0.265942625291867
Development loss: 0.2854362130041641
Training accuracy: 0.8858693426172435
Development accuracy: 0.8781791374861777
import matplotlib.pyplot as plt

# Loss curve
plt.plot(train_loss)
plt.plot(dev_loss)
plt.title('Loss')
plt.legend(['train', 'dev'])
plt.savefig('loss.png')
plt.show()

# Accuracy curve
plt.plot(train_acc)
plt.plot(dev_acc)
plt.title('Accuracy')
plt.legend(['train', 'dev'])
plt.savefig('acc.png')
plt.show()

Prediction

Run the trained model on the test data to obtain predictions.
The results are written to output_logistic.csv (via output_fpath.format('logistic')).

predictions = _predict(X_test, w, b)
with open(output_fpath.format('logistic'), 'w') as f:
    f.write('id,label\n')
    for i, label in enumerate(predictions):
        f.write('{},{}\n'.format(i, label))

# Print out the most significant weights,
# i.e. the ten features with the largest absolute weights
ind = np.argsort(np.abs(w))[::-1]
with open(X_test_fpath) as f:
    content = f.readline().strip('\n').split(',')
features = np.array(content)
for i in ind[0:10]:
    print(features[i], w[i])
 Unemployed full-time 1.1225676433808978
 Not in universe -1.0573592082887484
 Other Rel <18 never married RP of subfamily -0.912973403518828
 Child 18+ ever marr Not in a subfamily -0.8705099085602619
 1 0.7950300190669547
 Spouse of householder -0.750112419980567
 Other Rel <18 ever marr RP of subfamily -0.7491036728017868
 Italy -0.7240219980088511
capital losses 0.6611812623046104
id 0.5751429906332644

Method 2: Generative Model

The training and test data are processed exactly as in logistic regression. Because the generative model has a closed-form optimal solution, a development set is not needed; the resulting closed-form expressions are sketched below.
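
As a reminder of why no iterative training is needed (standard derivation; the symbols mirror the variables in the code below): modelling each class with a Gaussian that shares a single covariance matrix makes the posterior P(C_0 | x) a sigmoid of a linear function of x, so w and b follow directly from the class means mu_0, mu_1, the shared covariance Sigma, and the class counts N_0, N_1:

\Sigma = \frac{N_0 \Sigma_0 + N_1 \Sigma_1}{N_0 + N_1}

P(C_0 \mid x) = \sigma(w \cdot x + b), \quad
w = \Sigma^{-1} (\mu_0 - \mu_1), \quad
b = -\tfrac{1}{2}\, \mu_0^{T} \Sigma^{-1} \mu_0 + \tfrac{1}{2}\, \mu_1^{T} \Sigma^{-1} \mu_1 + \ln \frac{N_0}{N_1}

This is exactly what the code computes; since these weights score class 0, the final labels are obtained as 1 - _predict(...).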

Data Processing and Normalization

# Parse csv files to numpy array
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype = float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)

# Normalize training and testing data
X_train, X_mean, X_std = _normalize(X_train, train = True)
X_test, _, _ = _normalize(X_test, train = False, specified_column = None, X_mean = X_mean, X_std = X_std)

Computing the Per-Class Means and Covariances

# Compute in-class mean
# Separate the data into the two classes
X_train_0 = np.array([x for x, y in zip(X_train, Y_train) if y == 0])
X_train_1 = np.array([x for x, y in zip(X_train, Y_train) if y == 1])

mean_0 = np.mean(X_train_0, axis = 0)
mean_1 = np.mean(X_train_1, axis = 0)

# Compute in-class covariance
cov_0 = np.zeros((data_dim, data_dim))
cov_1 = np.zeros((data_dim, data_dim))

for x in X_train_0:
    cov_0 += np.dot(np.transpose([x - mean_0]), [x - mean_0]) / X_train_0.shape[0]
for x in X_train_1:
    cov_1 += np.dot(np.transpose([x - mean_1]), [x - mean_1]) / X_train_1.shape[0]

# Shared covariance is taken as a weighted average of individual in-class covariances.
cov = (cov_0 * X_train_0.shape[0] + cov_1 * X_train_1.shape[0]) / (X_train_0.shape[0] + X_train_1.shape[0])

# Compute inverse of covariance matrix.
# Since covariance matrix may be nearly singular, np.linalg.inv() may give a large numerical error.
# Via SVD decomposition, one can get matrix inverse efficiently and accurately.
u, s, v = np.linalg.svd(cov, full_matrices=False)
inv = np.matmul(v.T * 1 / s, u.T)

# Directly compute weights and bias
w = np.dot(inv, mean_0 - mean_1)
b = (-0.5) * np.dot(mean_0, np.dot(inv, mean_0)) + 0.5 * np.dot(mean_1, np.dot(inv, mean_1))\
+ np.log(float(X_train_0.shape[0]) / X_train_1.shape[0])

# Compute accuracy on training set
Y_train_pred = 1 - _predict(X_train, w, b)
print('Training accuracy: {}'.format(_accuracy(Y_train_pred, Y_train)))
Training accuracy: 0.873820406959599
# Predict testing labels
predictions = 1 - _predict(X_test, w, b)
with open(output_fpath.format('generative'), 'w') as f:
    f.write('id,label\n')
    for i, label in enumerate(predictions):
        f.write('{},{}\n'.format(i, label))

# Print out the most significant weights
ind = np.argsort(np.abs(w))[::-1]
with open(X_test_fpath) as f:
    content = f.readline().strip('\n').split(',')
features = np.array(content)
for i in ind[0:10]:
    print(features[i], w[i])

Agriculture 7.5625
41 -7.5
Retail trade 6.828125
Forestry and fisheries 6.03125
29 -6.0
35 5.265625
34 -5.15625
Sales -5.1171875
Construction -5.111328125
37 -4.79296875