import pandas as pd
import numpy as np
# Locations of the MNIST train/test CSV files, relative to the working directory.
trainfile, testfile = '../data/mnist_train.csv', '../data/mnist_test.csv'

# header=None: the first line of each file is read as data, so the columns are
# simply numbered 0..n-1.
df_train, df_test = (
    pd.read_csv(csv_path, header=None) for csv_path in (trainfile, testfile)
)
def transform(x):
    """Binarise one value: return 0 if it is below 255/2, otherwise 1."""
    threshold = 255 / 2
    return 0 if x < threshold else 1
# Binarise every feature column (column 0 is the label and is left untouched):
# values below 255/2 become 0, everything else becomes 1. This is the same rule
# as transform(), applied as one vectorised operation instead of one Python
# call per cell.
df_train.iloc[:, 1:] = np.where(df_train.iloc[:, 1:] < 255 / 2, 0, 1)
df_test.iloc[:, 1:] = np.where(df_test.iloc[:, 1:] < 255 / 2, 0, 1)

train_y = df_train.iloc[:, 0]   # labels
train_x = df_train.iloc[:, 1:]  # binarised features

# Number of training samples per class, and the matching class frequencies.
c_num = train_y.value_counts(sort=False)
P_class = c_num / len(train_y)

# Worked example for class 0: per-feature count of 1s among class-0 samples,
# then the Laplace-smoothed estimate (count + k) / (class size + k*N).
f_sum = train_x[train_y == 0].sum(axis=0)
c_0 = c_num[0]
k, N = 1, 2
P_fi_c0 = (f_sum + k) / (c_0 + k * N)
P_fi_c0[:5]  # notebook-style preview; has no effect when run as a script
def train(X, Y, k=1, N=2, C=10):
    """Fit the per-class statistics of a naive Bayes classifier.

    Parameters:
        X: sample features, a pandas.DataFrame (one row per sample).
           NOTE(review): the default N=2 presumably assumes binary 0/1
           features -- confirm against the caller.
        Y: sample labels, a pandas.Series aligned with X by index. Labels are
           expected to be the integers 0..C-1.
        k, N: Laplace smoothing parameters.
        C: number of classes.

    Returns:
        P_class: pandas.Series indexed 0..C-1 holding the number of training
            samples of each class (raw counts, not normalised; 0 for a class
            that has no samples).
        E: numpy.ndarray of shape (C, n_features) where
            E[c, j] = (sum of feature j over class c + k) / (count of c + N*k).
    """
    classes = range(C)
    # value_counts(sort=False) does not promise label order, and skips classes
    # with no samples. Reindexing pins position c to class c (matching the rows
    # of E) and gives absent classes a count of 0 instead of a NaN row in E.
    P_class = Y.value_counts(sort=False).reindex(classes, fill_value=0)
    # Row c: per-feature sums over the samples labelled c, plus smoothing.
    I_fc = pd.DataFrame([X[Y == c].sum(axis=0).tolist() for c in classes]) + k
    I_c = P_class + N * k
    E = I_fc.div(I_c, axis=0).values  # result is a numpy array
    return P_class, E
# Train once; the fitted statistics are reused for the walkthrough below
# (retraining with identical inputs would only repeat the same work).
P_class, E = train(train_x, train_y)
pd.DataFrame(E)  # notebook-style preview; has no effect when run as a script

sample_x = df_test.iloc[0, 1:].values.reshape(1, -1)  # take one test sample
sample_y = df_test.iloc[0, 0]  # label of that sample

# Step-by-step per-feature likelihood of the sample under every class:
# | |x - 1| - E | equals E where x == 1 and 1 - E where x == 0.
bz1 = np.subtract(sample_x, 1)
bz2 = np.abs(bz1)
bz3 = np.subtract(bz2, E)
bz4 = np.abs(bz3)
log_p_fi_c0 = np.log(bz4)  # take the logarithm

# The addition below pairs the Series with an ndarray by position, and the
# rows of E are in class order 0..C-1, so put the class counts in label order.
log_p_c0 = np.log(P_class.sort_index())
P_c0_X = log_p_c0 + np.sum(log_p_fi_c0, axis=1)
def predict(X, P_class, E):
    """Return the index of the highest-scoring class for one sample.

    Parameters:
        X: feature values of a single sample, array-like broadcastable against
           E (e.g. shape (1, n_features)).
           NOTE(review): the likelihood formula below presumably assumes
           binary 0/1 features -- confirm against the caller.
        P_class: pandas.Series of per-class weights (counts or priors) indexed
           by class label.
        E: numpy.ndarray of shape (C, n_features); row c holds the per-feature
           probabilities for class c.

    Returns:
        The row index of E (as returned by numpy.argmax) whose log score
        log(P_class[c]) + sum_j log(likelihood[c, j]) is largest.
    """
    # Row c of E belongs to class c, and the arrays are combined by position,
    # so put the class weights in label order before dropping the index.
    class_weights = P_class.sort_index().values
    # | |x - 1| - E | equals E where x == 1 and 1 - E where x == 0.
    likelihood = np.abs(np.abs(np.subtract(X, 1)) - E)
    log_p_fi_cj = np.log(likelihood)
    log_p_cj = np.log(class_weights)
    P_cj_X = np.add(log_p_cj, np.sum(log_p_fi_cj, axis=1))
    return np.argmax(P_cj_X)
# Prediction for the single sample currently held in sample_x.
max_class = predict(sample_x, P_class, E)

# Convert the test features to one array up front so the loop does plain
# array indexing instead of building a pandas row object per sample.
test_x = df_test.iloc[:, 1:].values
pre_list = []
for i in range(test_x.shape[0]):
    sample_x = test_x[i].reshape(1, -1)
    pre_list.append(predict(sample_x, P_class, E))

# Accuracy: fraction of test samples whose predicted class equals the label.
y = df_test.iloc[:, 0]
acc = int((y == pd.Series(pre_list, index=y.index)).sum()) / len(y)