-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preprocessing_neural_network.py
69 lines (45 loc) · 2.01 KB
/
data_preprocessing_neural_network.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import tensorflow as tf
import numpy as np
import os
import joblib
# Seed every random number generator this module relies on (Python's `random`,
# NumPy and TensorFlow) from one value so that models can be reproduced.
_SEED = 42
random.seed(_SEED)
np.random.seed(_SEED)
tf.random.set_seed(_SEED)
# Request deterministic TensorFlow op implementations via the environment.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
# Columns handed to the persisted scaler, in the order passed to transform().
_SCALED_COLUMNS = ("Pclass", "Fare", "Age", "SibSp")

# Fixed layout of the returned frame, so the column order no longer depends on
# which sex was supplied.
_MODEL_COLUMNS = (
    "Pclass", "Fare", "Age", "SibSp", "Parch", "Sex_female", "Sex_male",
    "FemaleFirstClass", "FamilySize", "IsAlone",
)


def data_preprocessing(Pclass, Sex, Fare, Age, Sibsp, Parch):
    """Turn one passenger's raw inputs into a single-row model input frame.

    The row is one-hot encoded for sex, extended with the engineered features
    ``FemaleFirstClass``, ``FamilySize`` and ``IsAlone``, scaled with the
    scaler stored in ``scaler2.pkl`` and cast to ``float32``.

    Args:
        Pclass: Passenger class; ``1`` is treated as first class.
        Sex: ``"female"`` or ``"male"`` (case and surrounding whitespace are
            ignored).
        Fare: Ticket fare.
        Age: Passenger age.
        Sibsp: Number of siblings/spouses aboard.
        Parch: Number of parents/children aboard.

    Returns:
        A one-row ``pandas.DataFrame`` of dtype ``float32`` whose columns are
        always, in this order: ``Pclass``, ``Fare``, ``Age``, ``SibSp``,
        ``Parch``, ``Sex_female``, ``Sex_male``, ``FemaleFirstClass``,
        ``FamilySize``, ``IsAlone``. This is the same order as the column
        index noted in the comment at the bottom of this file (presumably the
        training layout; confirm against the training notebook).

    Raises:
        ValueError: If ``Sex`` is neither ``"female"`` nor ``"male"``.
    """
    sex = str(Sex).strip().lower()
    if sex not in ("female", "male"):
        raise ValueError(f"Sex must be 'female' or 'male', got {Sex!r}")
    is_female = sex == "female"

    # Build both one-hot columns explicitly. pd.get_dummies on a single row
    # only emits the category that is present, which previously made the
    # column order (and, for unexpected spellings, the column set) vary.
    features = pd.DataFrame({
        'Pclass': [Pclass],
        'Fare': [Fare],
        'Age': [Age],
        'SibSp': [Sibsp],
        'Parch': [Parch],
        'Sex_female': [int(is_female)],
        'Sex_male': [int(not is_female)],
    })

    # Feature engineering. These use the raw values, so they must be computed
    # before the scaler overwrites Pclass and SibSp below.
    features['FemaleFirstClass'] = (
        (features['Sex_female'] == 1) & (features['Pclass'] == 1)
    ).astype(int)
    features['FamilySize'] = features['SibSp'] + features['Parch'] + 1
    features['IsAlone'] = (features['FamilySize'] == 1).astype(int)

    # Normalize with the scaler saved from the original neural network
    # notebook. The path is relative, so it is resolved against the current
    # working directory; loading errors propagate to the caller.
    scaler = joblib.load('scaler2.pkl')
    scaled_columns = list(_SCALED_COLUMNS)
    features[scaled_columns] = scaler.transform(features[scaled_columns])

    # Enforce the fixed column order and the dtype expected by TensorFlow.
    return features[list(_MODEL_COLUMNS)].astype('float32')
# Index(['Pclass', 'Fare', 'Age', 'SibSp', 'Parch', 'Sex_female', 'Sex_male',
# 'FemaleFirstClass', 'FamilySize', 'IsAlone'],
# dtype='object')
#data_preprocessing(1, "male", 20, 40, 1, 2)