次のコンペに投稿する方法です。
COVID19 Global Forecasting (Week 3)
covid19-global-forecasting.py を covid19_apr06.ipynb に変換してアップロードしました。
covid19-global-forecasting.py
#! /usr/bin/python
# -*- coding: utf-8 -*-
#
# covid19-global-forecasting.py
#
# Apr/06/2020
# PM 14:49
# ------------------------------------------------------------------
import numpy as np
import pandas as pd
import sys
from scipy.optimize import curve_fit
# NOTE(review): every argument passed here is None, which NumPy documents as
# "keep the current setting", so this call appears to change nothing.
# Confirm whether a real policy (e.g. all='ignore' or all='warn') was intended.
np.seterr(all=None, divide=None, over=None, under=None, invalid=None)
# ------------------------------------------------------------------
def curve_fit_proc(np_train):
    """Fit one curve per column of ``np_train`` and return the parameters.

    The model for column ``i`` is chosen by membership of ``i`` in the
    module-level lists ``as_exponent``, ``as_linear`` and ``as_sigmoid``
    (fitted with ``exp``, ``linear`` and ``sigmoid`` respectively) using
    ``scipy.optimize.curve_fit``.

    Args:
        np_train: 2-D array indexed as ``[row, column]``. Rows from index 45
            onwards are fitted (from 55 onwards for the linear model).
            NOTE(review): presumably rows are days and columns are regions
            -- confirm against the caller.

    Returns:
        list: ``coefs[i]`` is the optimal-parameter array for column ``i``.

    Raises:
        ValueError: if a column index belongs to none, or to more than one,
            of the three model lists. Without this check ``coefs`` would
            silently fall out of step with the column indices.
    """
    sys.stderr.write("*** curve_fit_proc *** start ***\n")
    coefs = []
    XX = np.arange(45, np_train.shape[0], 1)
    for it in range(np_train.shape[1]):
        sys.stderr.write("%d " % it)
        matches = [
            label
            for label, members in (
                ("exponent", as_exponent),
                ("linear", as_linear),
                ("sigmoid", as_sigmoid),
            )
            if it in members
        ]
        if len(matches) != 1:
            raise ValueError(
                "column %d must belong to exactly one model list, found %r"
                % (it, matches)
            )
        model = matches[0]
        if model == "exponent":
            fitted = curve_fit(
                exp, XX, np_train[45:, it],
                p0=(0.5, XX[0], 2, 0), maxfev=100000,
            )[0]
        elif model == "linear":
            # The linear model skips ten further rows (starts at row 55).
            fitted = curve_fit(
                linear, XX[10:], np_train[55:, it],
                p0=(1, 0, 0), maxfev=100000,
            )[0]
        else:
            fitted = curve_fit(
                sigmoid, XX, np_train[45:, it],
                p0=(1, XX[0], np_train[-1, it] / 2, 0), maxfev=100000,
            )[0]
        coefs.append(fitted)
    #
    sys.stderr.write("*** curve_fit_proc *** end ***\n")
    #
    return coefs
# ------------------------------------------------------------------
def pattern_proc():
    """Return the hand-assigned column-index lists for the three models.

    Returns:
        tuple: ``(as_exponent, as_linear, as_sigmoid)``, three lists of
        integer column indices, in that order.
    """
    exponent_columns = [
        *range(0, 4), 13, 14, 27,
        41, 42, 44, 48, 82, 85, 92, 93, 95, 96,
        100, 102, 106, 108, 110, 112, 113, 114,
        122, 125, 130, 131, 132, 134, 137, 138, 139, 143, 145, 148,
        149, 161, 165, 166, 172, 175, 177, 179, 185, 190, 194,
        202, 204, 205, 212, 213, 216, 218,
        224, 225, *range(228, 236), 237, 238,
        239, *range(241, 245), 246, 247, 250, 252, 256, 258, 260,
        261, 264, 266, 269, 272, 273, 274, 284, 285, 286,
        *range(288, 306),
    ]
    linear_columns = [
        4, 5, 7, 9, 10, 17, 18, 19, 22, 24, 25, 26, 29, 30,
        32, 33, 34, 36, 37, 38, 39, 40, 43, 45, 46, 47, 49,
        51, *range(53, 59), 60,
        *range(61, 82),
        83, 86, 88, 89, 91, 94, 97, 98, 99,
        101, 103, 104, 105, 107, 109, 111, 115, 116, 117, 118,
        120, 121, 123, 127, 128, 129, 135, 136, 141, 142, 144,
        146, 147, *range(150, 155), 156, 159,
        160, 162, 164, 167, 169, 170, 171, 174, 178,
        180, 182, 183, 184, *range(186, 190),
        191, 192, *range(195, 199), 200, 201, 203, 207, 208,
        209, 210, 211, 214, 217, 219,
        220, 221, 222, 226, 236, 240, 245,
        251, 254, 257, 259, 262, 263, 265,
        267, 268, 270, 271, *range(277, 284), 287,
    ]
    # The last two entries are intentionally kept in their original order.
    sigmoid_columns = [
        6, 8, 11, 12, 15, 16, 20, 21, 23, 28,
        31, 35, 50, 52, 59, 84, 87, 90,
        119, 124, 126, 133, 140, 155, 157, 158,
        163, 168, 173, 176, 181,
        193, 199, 206, 215, 223, 227, 248, 249, 253, 255, 276, 275,
    ]
    return exponent_columns, linear_columns, sigmoid_columns
# ------------------------------------------------------------------
def exp(x, a, b, d, p):
    """Evaluate the exponential model ``d * exp(a * x - b) + p``.

    If the evaluation raises, the error is reported on stderr and ``0.0``
    is returned instead of propagating the exception.
    """
    try:
        return d * np.exp(a * x - b) + p
    except Exception as err:
        sys.stderr.write("*** error *** in exp ***\n")
        sys.stderr.write(str(err) + "\n")
    return 0.0
#
# ------------------------------------------------------------------
def linear(xx, aa, bb, cc):
    """Evaluate the straight line ``aa * (xx - bb) + cc``."""
    shifted = xx - bb
    return aa * shifted + cc
# ------------------------------------------------------------------
def sigmoid(xx, aa, bb, dd, pp):
    """Evaluate the logistic model ``dd / (1 + exp(-(aa * xx - bb))) + pp``.

    If the exponential or the division raises, the error is reported on
    stderr and the process exits with status 1.
    """
    # Computed outside the try block, so errors here propagate to the caller.
    exponent = aa * xx - bb
    try:
        return dd / (1 + np.exp(-exponent)) + pp
    except Exception as err:
        sys.stderr.write("*** error *** in sigmoid ***\n")
        sys.stderr.write(str(err) + "\n")
        sys.exit(1)
# ------------------------------------------------------------------
def new_linear(x, a, b, c):
    """Evaluate ``a * (x - b) + c`` with negative results zeroed out.

    The value is multiplied by the boolean mask ``value >= 0`` rather than
    clipped, so the arithmetic matches a plain mask multiplication.
    """
    value = a * (x - b) + c
    non_negative = value >= 0
    return value * non_negative
# ------------------------------------------------------------------
def new_sigmoid(x, a, b, d, p):
    """Evaluate ``sigmoid`` with negative results zeroed out.

    The value is multiplied by the boolean mask ``value >= 0``.
    """
    value = sigmoid(x, a, b, d, p)
    non_negative = value >= 0
    return value * non_negative
# ------------------------------------------------------------------
def new_epx(x, a, b, d, p):
    """Evaluate ``exp`` with negative results zeroed out.

    ``exp`` is evaluated once and the value is multiplied by the boolean
    mask ``value >= 0``. Evaluating it a single time also means that an
    error inside ``exp`` is reported on stderr once rather than twice.

    Args:
        x: point(s) at which to evaluate the model.
        a, b, d, p: model parameters, passed through to ``exp`` unchanged.

    Returns:
        The value returned by ``exp`` where it is non-negative, and zero
        elsewhere.
    """
    value = exp(x, a, b, d, p)
    return value * (value >= 0)
# ------------------------------------------------------------------
def RMSLE(pred, actual):
    """Return the root mean squared difference of ``log(pred + 1)`` and ``log(actual + 1)``."""
    log_pred = np.log(pred + 1)
    log_actual = np.log(actual + 1)
    squared_gap = np.power(log_pred - log_actual, 2)
    return np.sqrt(np.mean(squared_gap))
#
# ------------------------------------------------------------------
# Load the three competition CSV files and derive the columns used below.
sys.stderr.write("*** start ***\n")
folder_src = '../input/covid19-global-forecasting-week-3'
data = pd.read_csv(folder_src + '/train.csv')
test_data = pd.read_csv(folder_src + '/test.csv')
submission = pd.read_csv(folder_src + '/submission.csv')
print(data.shape)
print(test_data.shape)
print(submission.shape)
# ------------------------------------------------------------------
for frame in (data, test_data):
    # Strip the dashes and convert to int so that dates can be compared
    # numerically with date_xx (a value like "2020-03-26" would become
    # 20200326).
    frame["Date"] = frame["Date"].str.replace("-", "", regex=False).astype(int)
    # One identifier per (Country_Region, Province_State) combination.
    frame['key'] = (
        frame['Country_Region'].astype('str')
        + " "
        + frame['Province_State'].astype('str')
    )
# data_train refers to the same DataFrame object as data; it is not a copy.
data_train = data
# ------------------------------------------------------------------
# Training rows dated before this value precede the test period.
date_xx = 20200326
nn_unique = data_train['key'].nunique()
# Per-key row counts.
# NOTE(review): the divisions assume every key has the same number of rows
# in train and in test -- confirm against the input files.
days_in_test = int(len(test_data) / nn_unique)
first_test_day = int(len(data_train[data_train.Date < date_xx]) / nn_unique)
# One past the last day index covered by the test data.
test_last_day = days_in_test + first_test_day
sys.stderr.write("nn_unique = %d\n" % nn_unique)
dtemp = data_train['key'].unique()
print(len(dtemp))
print("test_last_day = ", test_last_day)
# ------------------------------------------------------------------
# Day indices covered by the test data: [first_test_day, test_last_day).
test_days = np.arange(first_test_day, test_last_day, 1)
print(test_days)
# ------------------------------------------------------------------
sys.stderr.write("*** check *** ccc ***\n")
# Pivot the training data to one row per Date and one column per key, then
# take plain NumPy arrays for the numeric work below.
pivot_train = pd.pivot_table(
    data_train, index='Date', columns='key', values='ConfirmedCases'
)
pivot_train_d = pd.pivot_table(
    data_train, index='Date', columns='key', values='Fatalities'
)
np_train = pivot_train.to_numpy()
np_train_d = pivot_train_d.to_numpy()
sys.stderr.write("*** check *** eee ***\n")
# ------------------------------------------------------------------
# Diagnostic only: for each lag s, print how much the ratio of fatalities to
# confirmed cases (s rows earlier) changes from one row to the next, summed
# over the last 19 rows. Nothing below depends on the printed values.
shift = [0, 1, 2, 3, 4, 5, 6, 7]
for s in shift:
    # Named total_change so the builtin sum() is not shadowed.
    total_change = 0
    for i in range(1, 20):
        ratio_now = np_train_d[-i] / (np_train[-i - s] + 0.0001)
        ratio_prev = np_train_d[-i - 1] / (np_train[-i - 1 - s] + 0.0001)
        total_change += np.abs((ratio_now - ratio_prev).mean())
    print(total_change, s)
sys.stderr.write("*** check *** ggg ***\n")
# ------------------------------------------------------------------
# Per-column ratio of fatalities to confirmed cases, averaged over the last
# 20 rows (0.0001 guards against division by zero).
mask_deaths = np.zeros_like(np_train[0])
for days_back in range(1, 21):
    mask_deaths += np_train_d[-days_back] / (np_train[-days_back] + 0.0001)
mask_deaths = mask_deaths / 20
# Ratios above 0.5 or below 0.005 are replaced by the mean of the ratios
# that are non-zero and below 0.5.
usable = (mask_deaths < 0.5) & (mask_deaths != 0)
fallback_ratio = mask_deaths[usable].mean()
out_of_range = (mask_deaths > 0.5) | (mask_deaths < 0.005)
mask_deaths[out_of_range] = fallback_ratio
sys.stderr.write("*** check *** hhh ***\n")
# Repeat each column's ratio once per test day, flattened column-major so
# that it lines up with the flattened prediction matrix.
mask_mesh = np.meshgrid(mask_deaths, test_days)[0].T.flatten()
assert mask_mesh.shape[0] == test_data.shape[0]
# ------------------------------------------------------------------
as_exponent, as_linear, as_sigmoid = pattern_proc()
# ------------------------------------------------------------------
# A column index must not be assigned to more than one model; previously
# this overlap was computed but its result was discarded.
overlap = (
    (set(as_sigmoid) & set(as_linear))
    | (set(as_sigmoid) & set(as_exponent))
    | (set(as_exponent) & set(as_linear))
)
if overlap:
    raise ValueError(
        "column indices assigned to more than one model: %s" % sorted(overlap)
    )
sys.stderr.write("*** check *** iii ***\n")
# ------------------------------------------------------------------
coefs = curve_fit_proc(np_train)
# ------------------------------------------------------------------
# import matplotlib.pyplot as plt
# for i in as_linear:
# plt.plot(np_train[45:,i], label = str(i))
# plt.plot(linear(X, *coefs[i]))
# plt.legend()
# plt.show()
# ------------------------------------------------------------------
# One row per key, one column per test day.
ConfirmedCases_test = np.zeros((nn_unique, test_days.shape[0]))
# ------------------------------------------------------------------
sys.stderr.write("*** check *** jjj ***\n")
for it in range(np_train.shape[1]):
    # Checked in the order sigmoid, linear, exponent so that, if an index
    # were ever listed twice, the same model wins as with the former
    # sequence of independent "if" statements.
    if it in as_sigmoid:
        function = new_sigmoid
    elif it in as_linear:
        function = new_linear
    elif it in as_exponent:
        function = new_epx
    else:
        # Previously an unlisted column silently reused the model chosen
        # for the preceding column.
        raise ValueError("column %d is not assigned to any model" % it)
    ConfirmedCases_test[it] = function(test_days, *coefs[it])
sys.stderr.write("*** check *** kkk ***\n")
assert ConfirmedCases_test.flatten().shape[0] == test_data.shape[0]
# ------------------------------------------------------------------
# Flatten the (key, day) prediction matrix row by row.
# NOTE(review): this assumes the rows of test.csv and submission.csv are
# ordered key by key, in the same key order as the pivot-table columns --
# confirm against the input files.
flat_cases = ConfirmedCases_test.flatten()
test_data['predict'] = flat_cases
submission['ConfirmedCases'] = flat_cases
# Fatalities are the predicted cases scaled by the per-key death ratio.
submission['Fatalities'] = flat_cases * mask_mesh
submission.to_csv('submission.csv', index=False)
sys.stderr.write("*** end ***\n")
# ------------------------------------------------------------------
実行時に次の CSV ファイルが必要です。これらは、コンペのページからダウンロードできます。
$ ls ../input/covid19-global-forecasting-week-3/
submission.csv test.csv train.csv
ノートブックに変換する方法です。
ipynb-py-convert covid19-global-forecasting.py covid19_apr06.ipynb