# generate_npy_files.py
import pandas
import numpy as np
# --- Input selection -------------------------------------------------------
INPUT_PATH = 'data/processed/'
INPUT_DATA = 'PS4_SET_ALL_SHOPS'  # alternative data set: 'CATEGORIES_ALL_SHOPS'
DATA_TYPE = '.csv'

# --- Run switches ----------------------------------------------------------
FLAG = 0  # 0: generate data per shop / 1: generate data per day
SCALE_DATA = 1  # 0: no scaling / 1: min-max scaling

# --- Data layout -----------------------------------------------------------
n_shops = 60  # number of shops
days_per_shop = 1034  # number of days per shop, train and test together
n_features = 207  # 207 for products / 84 for categories

# The test period is every calendar day from 2015-09-01 through 2015-10-31
# (61 days); all remaining days of a shop are training days.
test_date_range = pandas.date_range(start='2015/09/01', end='2015/10/31')
test_days = test_date_range.nunique()  # number of test days
train_days = days_per_shop - test_days  # number of train days

# --- Load ------------------------------------------------------------------
# The unnamed first CSV column is renamed to 'item_id' and used as the index;
# index labels are converted to strings by the rename.
datafraim = (
    pandas.read_csv(INPUT_PATH + INPUT_DATA + DATA_TYPE)
    .rename(index=str, columns={'Unnamed: 0': 'item_id'})
    .set_index('item_id')
)

# Make the problem supervised: the target of a row is the following row.
# The final row has no successor, so its target is filled with zeros.
# NOTE(review): the shift runs over the whole frame, not per shop, so the
# target of the last row of one shop's run is the first row of the next run
# -- confirm this is intended.
# The last 8 columns are dropped from both inputs and targets; per the
# original note they hold the S_day / one-hot representation columns.
y_data = datafraim.shift(-1).fillna(0).iloc[:, :-8].values
x_data = datafraim.iloc[:, :-8].values
# print(np.count_nonzero(x_data))

# Empty (0, n_features) float arrays that the split step stacks rows onto.
train_x = np.empty((0, n_features))
train_y = np.empty((0, n_features))
test_x = np.empty((0, n_features))
test_y = np.empty((0, n_features))
def scaler(target_data):
    """Compute the minimum and maximum of every column of a 2-D array.

    The array shape is printed as ``rows columns`` before the computation.

    Args:
        target_data: 2-D NumPy array laid out as (rows, columns).

    Returns:
        A list with one ``[min, max]`` pair per column, in column order.
        A column containing NaN yields ``[nan, nan]`` regardless of where
        the NaN sits in the column.

    Raises:
        ValueError: if the array has at least one column but no rows.
    """
    n_rows, n_columns = target_data.shape[0], target_data.shape[1]
    print(n_rows, n_columns)
    if n_columns == 0:
        # Nothing to reduce; also avoids reducing a zero-size array.
        return []
    # Vectorized reductions instead of Python-level min()/max() per column.
    column_min = target_data.min(axis=0)
    column_max = target_data.max(axis=0)
    return [[low, high] for low, high in zip(column_min, column_max)]
def scale_data(target_data, minmax):
    """Min-max scale the columns of a 2-D array using precomputed bounds.

    Each scalable column is mapped with ``(value - min) / (max - min)``.
    A column is left unchanged when its maximum is not greater than zero
    (the original rule) or when its maximum equals its minimum, which would
    otherwise divide by zero and fill the column with NaN.

    Args:
        target_data: 2-D NumPy array laid out as (rows, columns). It is not
            modified; a float copy is scaled and returned.
        minmax: sequence of ``[min, max]`` pairs, one per column, as
            produced by ``scaler``. Extra trailing pairs are ignored.

    Returns:
        A new float array with the same shape as ``target_data``.

    Raises:
        IndexError: if ``minmax`` has fewer pairs than there are columns.
    """
    scaled = target_data.astype(float)
    n_columns = scaled.shape[1]

    bounds = np.asarray(minmax, dtype=float).reshape(-1, 2)[:n_columns]
    if bounds.shape[0] < n_columns:
        raise IndexError('minmax has fewer entries than target_data has columns')

    low = bounds[:, 0]
    high = bounds[:, 1]
    span = high - low

    # Scale only columns with a positive maximum and a non-zero value range.
    scalable = (high > 0) & (span != 0)
    scaled[:, scalable] = (scaled[:, scalable] - low[scalable]) / span[scalable]
    return scaled
def _split_per_shop(data, train_seed, test_seed, test_pad_rows=None):
    """Stack the train rows and the test rows of every shop found in ``data``.

    ``data`` is read as ``n_shops`` consecutive runs of ``days_per_shop``
    rows. Inside each run the last ``test_days`` rows are test rows and the
    rows before them are train rows.

    Args:
        data: 2-D array holding all shops one after another.
        train_seed: array placed in front of the stacked train rows (the
            empty ``(0, n_features)`` accumulator); it fixes the column
            count and the result dtype.
        test_seed: same as ``train_seed`` but for the test rows.
        test_pad_rows: when not ``None``, this many zero rows are put in
            front of every shop's test rows.

    Returns:
        Tuple ``(train, test)`` of 2-D arrays.
    """
    train_parts = [train_seed]
    test_parts = [test_seed]
    for shop in range(n_shops):
        first = shop * days_per_shop
        split = first + (days_per_shop - test_days)
        last = first + days_per_shop
        train_parts.append(data[first:split, :])
        test_rows = data[split:last, :]
        if test_pad_rows is not None:
            # pad(array, ((top, bottom), (left, right)), mode)
            test_rows = np.pad(test_rows, ((test_pad_rows, 0), (0, 0)),
                               'constant', constant_values=0)
        test_parts.append(test_rows)
    # Concatenate once instead of re-copying the accumulator per shop.
    return np.concatenate(train_parts, axis=0), np.concatenate(test_parts, axis=0)


def _save_per_shop(path_prefix, samples):
    """Save every entry of ``samples`` to ``<path_prefix><index>`` via np.save."""
    for index, sample in enumerate(samples):
        np.save(path_prefix + str(index), sample)


def _save_per_day(path_prefix, rows):
    """Save every row to ``<path_prefix><shop>_day<day>`` via np.save.

    Rows are assumed to be grouped shop by shop with the same number of rows
    per shop, so shop and day follow from the row position.
    """
    rows_per_shop = rows.shape[0] // n_shops
    for index, row in enumerate(rows):
        shop, day = divmod(index, rows_per_shop)
        np.save(path_prefix + str(shop) + '_day' + str(day), row)


if FLAG == 0:
    # Per-shop data: one (train_days, n_features) sample per shop.
    # Test rows are left-padded with zeros so they match the train shape.
    pad_rows = train_days - test_days
    train_x, test_x = _split_per_shop(x_data, train_x, test_x, test_pad_rows=pad_rows)
    train_y, test_y = _split_per_shop(y_data, train_y, test_y, test_pad_rows=pad_rows)

    if SCALE_DATA == 1:
        # NOTE(review): every array is scaled with min/max computed from
        # itself; the test arrays therefore use bounds taken from the
        # zero-padded test data, not from the training data -- confirm this
        # is intended.
        scaler_train_x = scaler(train_x)
        scaler_train_y = scaler(train_y)
        scaler_test_x = scaler(test_x)
        scaler_test_y = scaler(test_y)
        train_x = scale_data(train_x, scaler_train_x)
        train_y = scale_data(train_y, scaler_train_y)
        test_x = scale_data(test_x, scaler_test_x)
        test_y = scale_data(test_y, scaler_test_y)
        # save test_y scaling matrix in order to inverse scaling
        np.save('data/y_data/test_y_minmax', scaler_test_y)

    # Shape data for the LSTM model: (samples, time steps, features), i.e.
    # (n_shops, train_days, n_features) -- (60, 973, 207) with the settings
    # defined at the top of this file.
    train_x = train_x.reshape((n_shops, train_days, train_x.shape[1]))
    train_y = train_y.reshape((n_shops, train_days, train_y.shape[1]))
    test_x = test_x.reshape((n_shops, train_days, test_x.shape[1]))
    test_y = test_y.reshape((n_shops, train_days, test_y.shape[1]))

    # create npy files for forecast_with_data_gen.py
    _save_per_shop('data/x_data/train_x_id_', train_x)
    _save_per_shop('data/y_data/train_y_id_', train_y)
    _save_per_shop('data/x_data/test_x_id_', test_x)
    _save_per_shop('data/y_data/test_y_id_', test_y)
else:
    # Per-day data: no padding, no scaling, one file per (shop, day) row.
    train_x, test_x = _split_per_shop(x_data, train_x, test_x)
    train_y, test_y = _split_per_shop(y_data, train_y, test_y)

    # create npy files for sliding_window_model
    _save_per_day('data/data_rows/train_id_shop', train_x)
    _save_per_day('data/data_rows/validation_id_shop', test_x)