knaka Tech-Blog

Posts planned on AI, IoT, DIY electronics, and data science.

Tackling the Kaggle Titanic problem with deep learning.


Overview

This is a follow-up to my earlier post on the Titanic problem with logistic regression.
This time I classified the outcomes with deep learning.

Environment

keras : 2.1.3
tensorflow : 1.4
python : 3.5.2

Tested on Google Colab, among others.
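
A quick way to confirm the versions above (a minimal check; the versions preinstalled on Colab may differ):

import sys
import keras
import tensorflow as tf
print("python :", sys.version)
print("keras :", keras.__version__)
print("tensorflow :", tf.__version__)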

Training data

From the Kaggle page above, copy the training data and related files:


train.csv
test.csv


・Target variable:
 Survived: whether the passenger survived.
 Note that the test data does not include Survived.

・Explanatory variables:
 age, sex, and so on; a quick look at these columns follows.
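
A minimal way to peek at these columns (an illustrative check; the column names are those of the Kaggle Titanic CSVs):

import pandas as pd

train = pd.read_csv("train.csv")
print(train.columns.tolist())                    # Survived plus the candidate features
print(train[["Survived", "Sex", "Age"]].head())  # target next to two of the features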

Results

Leading with the result: the score was
78.46%
A slight improvement over last time, but there is still plenty of room to improve.

f:id:knaka0209:20181228163124p:plain

Code

Number of neurons
Input: 5 (the number of explanatory variables)
Hidden layer: 10
Output layer: 2
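
The ClassNet class imported by the script below comes from a class_net module that is not shown in this post. As a reference, here is a minimal sketch of what it could look like, assuming a plain two-layer network (sigmoid hidden layer, softmax output, cross-entropy loss, backprop gradients) in the style of "Deep Learning from Scratch"; the actual ClassNet may differ:

import pickle
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def softmax(x):
    x = x - x.max(axis=1, keepdims=True)  # shift for numerical stability
    e = np.exp(x)
    return e / e.sum(axis=1, keepdims=True)

def cross_entropy_error(y, t):
    # t is one-hot; add a small constant to avoid log(0)
    return -np.sum(t * np.log(y + 1e-7)) / y.shape[0]

class ClassNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {
            'W1': weight_init_std * np.random.randn(input_size, hidden_size),
            'b1': np.zeros(hidden_size),
            'W2': weight_init_std * np.random.randn(hidden_size, output_size),
            'b2': np.zeros(output_size),
        }

    def predict(self, x):
        z1 = sigmoid(np.dot(x, self.params['W1']) + self.params['b1'])
        return softmax(np.dot(z1, self.params['W2']) + self.params['b2'])

    def loss(self, x, t):
        return cross_entropy_error(self.predict(x), t)

    def accuracy(self, x, t):
        return np.mean(np.argmax(self.predict(x), axis=1) == np.argmax(t, axis=1))

    def gradient(self, x, t):
        # backprop through softmax + cross-entropy and the sigmoid hidden layer
        W2 = self.params['W2']
        z1 = sigmoid(np.dot(x, self.params['W1']) + self.params['b1'])
        y = softmax(np.dot(z1, W2) + self.params['b2'])
        dy = (y - t) / x.shape[0]
        dz1 = np.dot(dy, W2.T) * z1 * (1.0 - z1)
        return {
            'W1': np.dot(x.T, dz1),
            'b1': np.sum(dz1, axis=0),
            'W2': np.dot(z1.T, dy),
            'b2': np.sum(dy, axis=0),
        }

    def save_params(self, file_name="params.pkl"):
        with open(file_name, 'wb') as f:
            pickle.dump(self.params, f)

    def load_params(self, file_name="params.pkl"):
        with open(file_name, 'rb') as f:
            self.params = pickle.load(f)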

import numpy as np
import numpy.random as random
import scipy as sp
from pandas import Series, DataFrame
import pandas as pd
import time
# visualization modules
import matplotlib.pyplot as plt
import matplotlib as mpl
# machine learning modules
import sklearn
from keras.utils import np_utils
from class_net import ClassNet  # custom two-layer network (see the sketch above)

#
def get_subData(src):
    # preprocess: fill missing Age with the median, drop remaining NaN rows,
    # and encode Embarked (C/Q/S -> 0/1/2) and Sex (female/male -> 0/1)
    sub = src.copy()  # copy to avoid mutating the caller's frame
    sub["Age"] = src["Age"].fillna(src["Age"].median())
    sub = sub.dropna()
    sub["Embark_flg"] = sub["Embarked"]
    sub["Embark_flg"] = sub["Embark_flg"].map(lambda x: str(x).replace('C', '0'))
    sub["Embark_flg"] = sub["Embark_flg"].map(lambda x: str(x).replace('Q', '1'))
    sub["Embark_flg"] = sub["Embark_flg"].map(lambda x: str(x).replace('S', '2'))
    # convert the encoded column to numeric
    sub = sub.assign(Embark_flg=pd.to_numeric(sub.Embark_flg))
    sub["Sex_flg"] = sub["Sex"].map(lambda x: 0 if x == 'female' else 1)
    return sub
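
# (Hypothetical quick check, not in the original post:
#  get_subData(train2)[["Sex", "Sex_flg", "Embarked", "Embark_flg"]].head()
#  shows C/Q/S mapped to 0/1/2 and female/male mapped to 0/1.)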

# load the training and test data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
print(train_data.shape)
#
# preprocessing: replace missing data with the median
train2  = train_data[["PassengerId","Survived","Sex","Age" , "Embarked" ,"SibSp" ,"Parch" ]]
test2   = test_data[ ["PassengerId"           ,"Sex","Age" , "Embarked" ,"SibSp" ,"Parch" ]]
#
print(train2.info())
print(train2.head(10))

train_sub = get_subData(train2)
test_sub = get_subData(test2)
print(train_sub.info())
print(test_sub.info())

# explanatory (feature) and target variables
x_train = train_sub[["Sex_flg", "Age", "Embark_flg", "SibSp", "Parch"]]
y_train = train_sub['Survived']
x_test = test_sub[["Sex_flg", "Age", "Embark_flg", "SibSp", "Parch"]]

# convert the data frames to NumPy arrays
x_train = np.array(x_train, dtype=np.float32).reshape(len(x_train), 5)
y_train = np.array(y_train, dtype=np.float32).reshape(len(y_train), 1)
x_test = np.array(x_test, dtype=np.float32).reshape(len(x_test), 5)
# convert the labels to a one-hot representation
y_train = np_utils.to_categorical(y_train, 2)
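# e.g. label 0 becomes [1, 0] and label 1 becomes [0, 1]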

#


# check the shapes of the training and test arrays
print(x_train.shape, y_train.shape)
print(x_test.shape)
#
# build the two-layer classification network
network = ClassNet(input_size=5, hidden_size=10, output_size=2)

iters_num = 10000  # number of training iterations
train_size = x_train.shape[0]
print(train_size)
#
global_start_time = time.time()

batch_size = 32
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

iter_per_epoch = 1000  # log accuracy every 1000 iterations

for i in range(iters_num):
    # sample a random mini-batch
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = y_train[batch_mask]

    # compute the gradients
    grad = network.gradient(x_batch, t_batch)

    # update the parameters (plain SGD)
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    if i % iter_per_epoch == 0:
        # the Kaggle test set has no Survived labels, so only train accuracy is logged
        train_acc = network.accuracy(x_train, y_train)
        train_acc_list.append(train_acc)
        print("i=" + str(i) + ", train acc | " + str(train_acc) + " , loss=" + str(loss))
        print('time : ', time.time() - global_start_time)
# final accuracy on the full training set
train_acc = network.accuracy(x_train, y_train)
print("train acc | " + str(train_acc) + " , loss=" + str(loss))
print('time : ', time.time() - global_start_time)
#
# save the learned parameters
network.save_params("params.pkl")
print("Saved Network Parameters!")

# plot the training-accuracy curve
a1 = np.arange(len(train_acc_list))
plt.plot(a1, train_acc_list, label="train acc")
plt.legend()
plt.grid(True)
plt.title("Titanic: train accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.show()
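
Since the Kaggle test set has no Survived labels, the loop above can only track training accuracy. A held-out validation split would give a more honest estimate; this is a minimal sketch (not in the original post), assuming sklearn's train_test_split and reusing the variables defined above:

from sklearn.model_selection import train_test_split

# hold out 20% of the labeled data as a validation set
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=0)
val_net = ClassNet(input_size=5, hidden_size=10, output_size=2)
for i in range(iters_num):
    mask = np.random.choice(len(x_tr), batch_size)
    grad = val_net.gradient(x_tr[mask], y_tr[mask])
    for key in ('W1', 'b1', 'W2', 'b2'):
        val_net.params[key] -= learning_rate * grad[key]
print("val acc:", val_net.accuracy(x_val, y_val))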

Evaluation

# rebuild the network and load the saved parameters
network = ClassNet(input_size=5, hidden_size=10, output_size=2)
network.load_params("params.pkl")


train_acc = network.accuracy(x_train, y_train)
print("train acc | " + str(train_acc))
print('time : ', time.time() - global_start_time)

#
# predict on the test data and write the result to CSV
pred = network.predict(x_test)
print(pred.shape)
# take the argmax of the two class scores as the predicted label
outList = []
for item in pred:
    y = np.argmax(item)
    outList.append(y)

pred_y = np.array(outList)

PassengerId = np.array(test_data["PassengerId"]).astype(int)
df = pd.DataFrame(pred_y, PassengerId, columns=["Survived"])
print(df.head())

#
df.to_csv("out2.csv", index_label=["PassengerId"])
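
The resulting out2.csv has the PassengerId and Survived columns expected by the Kaggle Titanic submission format.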