GluonTSを使ってみる (2)-Bike Sharing Dataset-

archive.ics.uci.edu
「Bike Sharing Dataset Data Set」の日次データ(day.csv)を使い、cnt列の値を予測する。

  • use no features

f:id:touch-sp:20190914174955p:plain

  • use 'feat_dynamic_real' and 'feat_dynamic_cat'

f:id:touch-sp:20190914175009p:plain

コード

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Load the daily records; the CSV's second column (index_col=1) becomes the
# DataFrame index and is later used as the series start.
df = pd.read_csv('day.csv', index_col=1)

# Real-valued covariates, one row per feature and one column per record.
features_real = np.vstack([
    np.array(df['hum']),
    np.array(df['temp']),
    np.array(df['windspeed']),
])

# Categorical covariates in the same layout. `weathersit` and `season` are
# shifted down by one (presumably because their codes start at 1 -- confirm
# against the dataset description).
features_cat = np.vstack([
    np.array(df['weekday']),
    np.array(df['workingday']),
    np.array(df['weathersit']) - 1,
    np.array(df['season']) - 1,
])

from gluonts.dataset.common import ListDataset

# Training entry: the last 14 daily observations (and the matching feature
# columns) are cut off so they can serve as the evaluation window.
train_entry = {
    "start": df.index[0],
    "target": df.cnt[:-14],
    "feat_dynamic_real": features_real[:, :-14],
    "feat_dynamic_cat": features_cat[:, :-14],
}
training_data = ListDataset([train_entry], freq="1D")

# Test entry: the full series, including feature values for the final
# 14 days that the model will be asked to predict.
test_entry = {
    "start": df.index[0],
    "target": df.cnt,
    "feat_dynamic_real": features_real,
    "feat_dynamic_cat": features_cat,
}
test_data = ListDataset([test_entry], freq="1D")

from gluonts.model.deepar import DeepAREstimator
from gluonts.trainer import Trainer
# Predict the next `prediction_length` values from the preceding
# `context_length` values.
# NOTE(review): only `use_feat_dynamic_real` is switched on here; no option
# concerning `feat_dynamic_cat` is passed, so that field is presumably not
# consumed by the model -- confirm against the DeepAREstimator documentation.
training_loop = Trainer(epochs=50)
estimator = DeepAREstimator(
    freq="1D",
    prediction_length=14,
    context_length=28,
    use_feat_dynamic_real=True,
    trainer=training_loop,
)
predictor = estimator.train(training_data=training_data)

from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.dataset.util import to_pandas

# Produce forecasts for the test dataset with the trained predictor,
# using 100 sample paths for evaluation.
forecast_it, ts_it = make_evaluation_predictions(
    dataset=test_data,
    predictor=predictor,
    num_eval_samples=100,
)

plot_length = 30  # number of trailing observations drawn
prediction_intervals = (50.0, 90.0)

# Interval labels are listed widest-first (reverse of `prediction_intervals`).
interval_labels = [f"{k}% prediction interval" for k in reversed(prediction_intervals)]
legend = ["observations", "median prediction", *interval_labels]

for entry, forecast in zip(test_data, forecast_it):
    observed = to_pandas(entry)
    observed[-plot_length:].plot()
    forecast.plot(color='g', prediction_intervals=prediction_intervals)
    plt.grid(which='both')
    plt.legend(legend, loc='upper left')

plt.show()

問題点

非常にうまくいっているようにみえるがこれにはからくりがある。
将来の2週間を予測する際に、将来の「天候」「気温」「湿度」「風速」などをモデルに提供している。
正確にわかるはずのない将来データを使用している点で問題あり。

疑問

「feat_dynamic_cat」は利用されているのか?
なくても同様の結果になる。
試しにすべての「features」を「feat_dynamic_real」にいれてみたところ一番良い結果が得られた。

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

def one_hot(x, start_zero=True):
    """Encode integer category codes as a one-hot matrix.

    Args:
        x: 1-D array-like of integer category codes (NumPy array,
            pandas Series, list, ...).
        start_zero: True when the smallest possible code is 0. False when
            codes start at 1; they are then shifted down by one first.

    Returns:
        A float64 array of shape (n_categories, len(x)), where
        n_categories is the largest (shifted) code plus one. Entry
        [c, i] is 1 when element i has code c, otherwise 0.

    Raises:
        ValueError: if `x` is empty, or if a code is negative after the
            optional shift (which would otherwise wrap around to the last
            category through negative indexing).
    """
    codes = np.asarray(x)
    if not start_zero:
        codes = codes - 1

    if codes.size == 0:
        raise ValueError("one_hot() needs at least one category code")
    if codes.min() < 0:
        raise ValueError(
            "category codes must be non-negative after applying start_zero"
        )

    category_n = int(codes.max()) + 1
    encoded = np.zeros((category_n, codes.size))
    # One vectorized assignment: row = category code, column = position.
    encoded[codes, np.arange(codes.size)] = 1
    return encoded

# Load the daily records; the CSV's second column (index_col=1) becomes the
# DataFrame index and is later used as the series start.
df = pd.read_csv('day.csv', index_col=1)

# Every covariate goes into one real-valued matrix (one row per feature,
# one column per record). Multi-valued categorical columns are expanded
# with `one_hot`, so each category gets its own row; `workingday` is used
# as a single row as-is.
features_real = np.vstack([
    np.array(df['hum']),
    np.array(df['temp']),
    np.array(df['windspeed']),
    one_hot(df['weekday'], start_zero=True),
    np.array(df['workingday']),
    one_hot(df['weathersit'], start_zero=False),
    one_hot(df['season'], start_zero=False),
])

from gluonts.dataset.common import ListDataset

# No "feat_dynamic_cat" field in this variant: the categorical columns are
# already part of `features_real` as one-hot rows.

# Training entry: the last 14 daily observations (and the matching feature
# columns) are cut off so they can serve as the evaluation window.
train_entry = {
    "start": df.index[0],
    "target": df.cnt[:-14],
    "feat_dynamic_real": features_real[:, :-14],
}
training_data = ListDataset([train_entry], freq="1D")

# Test entry: the full series with feature values for every record.
test_entry = {
    "start": df.index[0],
    "target": df.cnt,
    "feat_dynamic_real": features_real,
}
test_data = ListDataset([test_entry], freq="1D")

from gluonts.model.deepar import DeepAREstimator
from gluonts.trainer import Trainer
# Predict the next `prediction_length` values from the preceding
# `context_length` values.
training_loop = Trainer(epochs=50)
estimator = DeepAREstimator(
    freq="1D",
    prediction_length=14,
    context_length=28,
    use_feat_dynamic_real=True,
    trainer=training_loop,
)
predictor = estimator.train(training_data=training_data)

from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.dataset.util import to_pandas

# Produce forecasts for the test dataset with the trained predictor,
# using 100 sample paths for evaluation.
forecast_it, ts_it = make_evaluation_predictions(
    dataset=test_data,
    predictor=predictor,
    num_eval_samples=100,
)

plot_length = 30  # number of trailing observations drawn
prediction_intervals = (50.0, 90.0)

# Interval labels are listed widest-first (reverse of `prediction_intervals`).
interval_labels = [f"{k}% prediction interval" for k in reversed(prediction_intervals)]
legend = ["observations", "median prediction", *interval_labels]

for entry, forecast in zip(test_data, forecast_it):
    observed = to_pandas(entry)
    observed[-plot_length:].plot()
    forecast.plot(color='g', prediction_intervals=prediction_intervals)
    plt.grid(which='both')
    plt.legend(legend, loc='upper left')

plt.show()

f:id:touch-sp:20190914230037p:plain

2019年10月9日追記

one_hot関数は以下のようにシンプルに書ける。

def one_hot(x, start_zero=True):
    """Encode integer category codes as a one-hot matrix.

    Args:
        x: 1-D array-like of integer category codes (NumPy array,
            pandas Series, list, ...).
        start_zero: True when the smallest possible code is 0. False when
            codes start at 1; they are then shifted down by one first.

    Returns:
        A float64 array of shape (n_categories, len(x)), where
        n_categories is the largest (shifted) code plus one.

    Raises:
        ValueError: if `x` is empty, or if a code is negative after the
            optional shift (indexing the identity matrix with a negative
            code would silently select the last category).
    """
    codes = np.asarray(x)
    if not start_zero:
        codes = codes - 1
    if codes.size == 0 or codes.min() < 0:
        raise ValueError(
            "one_hot() needs a non-empty input whose codes are "
            "non-negative after applying start_zero"
        )

    category_n = int(codes.max()) + 1
    # Row k of the identity matrix is the one-hot vector of category k, so
    # indexing by the codes yields (len(x), n_categories); transpose it.
    one_hot_vec = np.identity(category_n)[codes]

    return one_hot_vec.transpose(1, 0)