More than 5 years have passed since last update.

Kaggle COVID19 Global Forecasting (Week 3)

Last updated at 2020-04-06 · Posted at 2020-04-06

次のコンペに投稿する方法です。

COVID19 Global Forecasting (Week 3)
covid19-global-forecasting.py を covid19_apr06.ipynb に変換してアップロードしました。

covid19-global-forecasting.py

# ! /usr/bin/python
# -*- coding: utf-8 -*-
#
#	covid19-global-forecasting.py
#
#						Apr/06/2020
#							PM 14:49
# ------------------------------------------------------------------
import numpy as np
import pandas as pd
import sys
from scipy.optimize import curve_fit

# NOTE(review): every argument is passed as None. NumPy documents None as
# "leave the current setting alone", so this call presumably does not change
# floating-point error handling at all -- confirm whether e.g. 'ignore' or
# 'warn' was intended.
np.seterr(all=None, divide=None, over=None, under=None, invalid=None)
# ------------------------------------------------------------------
def curve_fit_proc(np_train):
    """Fit one curve per column of ``np_train`` and return the coefficients.

    Each column index is looked up in the module-level lists
    ``as_exponent``, ``as_linear`` and ``as_sigmoid`` and fitted with the
    matching model (``exp``, ``linear`` or ``sigmoid``) using
    ``scipy.optimize.curve_fit``.  The x values are the row indices
    starting at 45; the linear model skips ten more rows and is fitted
    from row 55 onwards.

    Args:
        np_train: 2-D array; rows are fitted against their row index and
            each column is one series.

    Returns:
        A list with exactly one coefficient array per column, so that
        ``coefs[it]`` always belongs to column ``it``.

    Raises:
        ValueError: if a column index appears in none of the three lists.
    """
    sys.stderr.write("*** curve_fit_proc *** start ***\n")

    fit_start = 45
    linear_skip = 10
    XX = np.arange(fit_start, np_train.shape[0], 1)

    # Sets give O(1) membership tests inside the loop.
    exponent_cols = set(as_exponent)
    linear_cols = set(as_linear)
    sigmoid_cols = set(as_sigmoid)

    coefs = []
    for it in range(np_train.shape[1]):
        sys.stderr.write("%d " % it)
        # Exactly one model per column keeps coefs aligned with the column
        # index.  Precedence (sigmoid, then linear, then exponent) mirrors
        # the "last match wins" selection used by the prediction loop.
        if it in sigmoid_cols:
            popt = curve_fit(
                sigmoid, XX, np_train[fit_start:, it],
                p0=(1, XX[0], np_train[-1, it] / 2, 0), maxfev=100000)[0]
        elif it in linear_cols:
            popt = curve_fit(
                linear, XX[linear_skip:], np_train[fit_start + linear_skip:, it],
                p0=(1, 0, 0), maxfev=100000)[0]
        elif it in exponent_cols:
            popt = curve_fit(
                exp, XX, np_train[fit_start:, it],
                p0=(0.5, XX[0], 2, 0), maxfev=100000)[0]
        else:
            raise ValueError(
                "column %d is not listed in as_exponent, as_linear or as_sigmoid" % it)
        coefs.append(popt)

    sys.stderr.write("*** curve_fit_proc *** end ***\n")

    return coefs
# ------------------------------------------------------------------
def pattern_proc():
    """Return the hand-assigned column indices for the three curve families.

    Returns:
        A tuple ``(as_exponent, as_linear, as_sigmoid)`` of lists of integer
        column indices.  Together the lists contain every index from 0 to
        305 exactly once.
    """
    exponent_cols = [
        0, 1, 2, 3, 13, 14, 27,
        41, 42, 44, 48, 82, 85, 92, 93, 95, 96,
        100, 102, 106, 108, 110, 112, 113, 114,
        122, 125, 130, 131, 132, 134, 137, 138, 139, 143, 145, 148,
        149, 161, 165, 166, 172, 175, 177, 179, 185, 190, 194,
        202, 204, 205, 212, 213, 216, 218,
        224, 225, *range(228, 236), 237, 238,
        239, 241, 242, 243, 244, 246, 247, 250, 252, 256, 258, 260,
        261, 264, 266, 269, 272, 273, 274, 284, 285, 286, 288, 289,
        *range(290, 306),
    ]

    linear_cols = [
        4, 5, 7, 9, 10, 17, 18, 19, 22, 24, 25, 26, 29, 30,
        32, 33, 34, 36, 37, 38, 39, 40, 43, 45, 46, 47, 49,
        51, 53, 54, 55, 56, 57, 58, 60,
        *range(61, 82),
        83, 86, 88, 89, 91, 94, 97, 98, 99,
        101, 103, 104, 105, 107, 109, 111, 115, 116, 117, 118,
        120, 121, 123, 127, 128, 129, 135, 136, 141, 142, 144,
        146, 147, 150, 151, 152, 153, 154, 156, 159,
        160, 162, 164, 167, 169, 170, 171, 174, 178,
        180, 182, 183, 184, 186, 187, 188, 189,
        191, 192, 195, 196, 197, 198, 200, 201, 203, 207, 208,
        209, 210, 211, 214, 217, 219,
        220, 221, 222, 226, 236, 240, 245,
        251, 254, 257, 259, 262, 263, 265,
        267, 268, 270, 271, 277, 278, 279,
        280, 281, 282, 283, 287,
    ]

    # Note: the last two entries are intentionally kept in their original
    # (non-ascending) order.
    sigmoid_cols = [
        6, 8, 11, 12, 15, 16, 20, 21, 23, 28,
        31, 35, 50, 52, 59, 84, 87, 90,
        119, 124, 126, 133, 140, 155, 157, 158,
        163, 168, 173, 176, 181,
        193, 199, 206, 215, 223, 227, 248, 249, 253, 255, 276, 275,
    ]

    return exponent_cols, linear_cols, sigmoid_cols
# ------------------------------------------------------------------
def exp(x, a, b, d, p):
    """Evaluate the exponential model ``d * e**(a*x - b) + p``.

    Any exception raised while evaluating the expression is reported on
    stderr and ``0.0`` is returned instead of propagating the error.
    """
    try:
        return d * np.exp(a * x - b) + p
    except Exception as err:
        sys.stderr.write("*** error *** in exp ***\n")
        sys.stderr.write(str(err) + "\n")
        return 0.0
#
# ------------------------------------------------------------------
def linear(xx, aa, bb, cc):
    """Evaluate the linear model ``aa * (xx - bb) + cc``."""
    shifted = xx - bb
    return aa * shifted + cc

# ------------------------------------------------------------------
def sigmoid(xx, aa, bb, dd, pp):
    """Evaluate the logistic model ``dd / (1 + e**-(aa*xx - bb)) + pp``.

    The argument ``aa*xx - bb`` is computed outside the guarded section, so
    errors there propagate to the caller.  An exception raised while
    evaluating the logistic expression itself is reported on stderr and
    the process exits with status 1.
    """
    arg = (aa * xx - bb)
    try:
        return dd / (1 + np.exp(-arg)) + pp
    except Exception as err:
        sys.stderr.write("*** error *** in sigmoid ***\n")
        sys.stderr.write(str(err) + "\n")
        sys.exit(1)
# ------------------------------------------------------------------
def new_linear(x, a, b, c):
    """Evaluate ``a * (x - b) + c`` with negative results replaced by zero.

    Clipping is done by multiplying the value with its own ``>= 0`` mask.
    """
    value = a * (x - b) + c
    non_negative = value >= 0
    return value * non_negative

# ------------------------------------------------------------------
def new_sigmoid(x, a, b, d, p):
    """Evaluate ``sigmoid`` with negative results replaced by zero.

    Clipping is done by multiplying the value with its own ``>= 0`` mask.
    """
    value = sigmoid(x, a, b, d, p)
    non_negative = value >= 0
    return value * non_negative

# ------------------------------------------------------------------
def new_epx(x, a, b, d, p):
    """Evaluate ``exp`` with negative results replaced by zero.

    The model is evaluated once and reused for both the value and its
    ``>= 0`` mask; evaluating it twice would repeat the work and, because
    ``exp`` logs its own failures to stderr, print every error twice.

    Note: the function name (``new_epx``) is kept as-is because the
    prediction loop refers to it by this name.
    """
    value = exp(x, a, b, d, p)
    return value * (value >= 0)

# ------------------------------------------------------------------
def RMSLE(pred, actual):
    """Return the root mean squared logarithmic error of ``pred`` vs ``actual``.

    Computes ``sqrt(mean((log(1 + pred) - log(1 + actual)) ** 2))``.

    ``np.log1p`` is used instead of ``np.log(x + 1)``: it stays accurate
    when values are very close to zero and it also accepts plain Python
    sequences, not only arrays.

    Args:
        pred: predicted values (array-like, each value > -1).
        actual: observed values (array-like, same shape as ``pred``).

    Returns:
        The RMSLE as a floating-point scalar.
    """
    log_diff = np.log1p(pred) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))
#
# ------------------------------------------------------------------
# Announce start-up, then load the three competition CSV files and show
# their shapes.
sys.stderr.write("*** start ***\n")

folder_src = '../input/covid19-global-forecasting-week-3'
data, test_data, submission = (
    pd.read_csv(folder_src + csv_name)
    for csv_name in ('/train.csv', '/test.csv', '/submission.csv')
)
for loaded_frame in (data, test_data, submission):
    print(loaded_frame.shape)

# ------------------------------------------------------------------

# Normalise both frames the same way:
#   * "Date" goes from a "YYYY-MM-DD" string to the integer YYYYMMDD so that
#     dates can be compared numerically later on;
#   * "key" is "<Country_Region> <Province_State>" and identifies one series
#     (a missing province is stringified by astype('str')).
# The string replace is vectorised (regex=False: "-" is a literal) instead of
# calling a Python lambda per row.
for frame in (data, test_data):
    frame["Date"] = frame["Date"].str.replace("-", "", regex=False).astype(int)
    frame['key'] = frame['Country_Region'].astype('str') + " " + frame['Province_State'].astype('str')

# data_train is an alias of data (same object), not a copy.
data_train = data

# ------------------------------------------------------------------
# Work out which day indices the test set covers.
date_xx = 20200326
nn_unique = data_train['key'].nunique()

# Number of training dates strictly before date_xx: rows before that date
# divided by the number of distinct keys.
train_days_before = int(data_train[data_train.Date < date_xx].shape[0] / nn_unique)

# last day in test data (used as the exclusive upper bound of test_days)
test_last_day = int(test_data.shape[0] / nn_unique) + train_days_before

sys.stderr.write("nn_unique = %d\n" % nn_unique)
dtemp = data_train['key'].unique()
print(len(dtemp))

print("test_last_day = ", test_last_day)

# ------------------------------------------------------------------

# days in test data
test_days = np.arange(train_days_before, test_last_day, 1)

print(test_days)

# ------------------------------------------------------------------
sys.stderr.write("*** check *** ccc ***\n")

# Pivot the long training table into date x key matrices: one row per Date,
# one column per key, for confirmed cases and for fatalities.
pivot_train = pd.pivot_table(data_train, index='Date', columns='key', values='ConfirmedCases')
pivot_train_d = pd.pivot_table(data_train, index='Date', columns='key', values='Fatalities')

# Plain NumPy views of the same matrices, used by the fitting code.
np_train = pivot_train.to_numpy()
np_train_d = pivot_train_d.to_numpy()

sys.stderr.write("*** check *** eee ***\n")

# ------------------------------------------------------------------

# Diagnostic only: for each lag s (in days) between cases and fatalities,
# print how much the fatalities/cases ratio changes from one day to the next
# over the last 19 days.  The printed totals are not used later in this
# script.
shift = [0, 1, 2, 3, 4, 5, 6, 7]
for s in shift:
    # Named "total" so the builtin sum() is not shadowed for the rest of
    # the module.
    total = 0
    for i in range(1, 20):
        ratio_today = np_train_d[-i] / (np_train[-i - s] + 0.0001)
        ratio_previous = np_train_d[-i - 1] / (np_train[-i - 1 - s] + 0.0001)
        total += np.abs((ratio_today - ratio_previous).mean())
    print(total, s)

sys.stderr.write("*** check *** ggg ***\n")

# ------------------------------------------------------------------
# Per-key fatality ratio: fatalities / (cases + 0.0001), averaged over the
# last 20 rows of the training matrices.
mask_deaths = np.zeros_like(np_train[0])
for i in range(1, 21):
    mask_deaths += np_train_d[-i] / (np_train[-i] + 0.0001)
mask_deaths = mask_deaths / 20

# Ratios above 0.5 or below 0.005 are replaced by the mean of the ratios
# that are below 0.5 and non-zero.  The mean is computed once, before any
# entry is overwritten.
fallback_ratio = mask_deaths[(mask_deaths < 0.5) & (mask_deaths != 0)].mean()
mask_deaths[(mask_deaths > 0.5) | (mask_deaths < 0.005)] = fallback_ratio

sys.stderr.write("*** check *** hhh ***\n")

# Repeat each key's ratio once per test day, key-major, so the result lines
# up with the flattened (key, day) prediction matrix.
mask_mesh = np.repeat(mask_deaths, test_days.shape[0])

assert mask_mesh.shape[0] == test_data.shape[0]

# ------------------------------------------------------------------
as_exponent, as_linear, as_sigmoid = pattern_proc()

# ------------------------------------------------------------------

# Sanity check: no column may be assigned to more than one curve family,
# otherwise the fitted coefficients and the prediction function could
# disagree for that column.
overlapping_cols = (
    (set(as_sigmoid) & set(as_linear))
    | (set(as_sigmoid) & set(as_exponent))
    | (set(as_exponent) & set(as_linear))
)
if overlapping_cols:
    raise ValueError(
        "columns assigned to more than one curve family: %s" % sorted(overlapping_cols))

sys.stderr.write("*** check *** iii ***\n")

# ------------------------------------------------------------------
# Fit one curve per column of the confirmed-cases matrix.
coefs = curve_fit_proc(np_train)

# ------------------------------------------------------------------
# import matplotlib.pyplot as plt
# for i in as_linear:
#    plt.plot(np_train[45:,i], label = str(i))
#    plt.plot(linear(X, *coefs[i]))
#    plt.legend()
#    plt.show()

# ------------------------------------------------------------------

# One row per key, one column per test day.
ConfirmedCases_test = np.zeros((nn_unique, test_days.shape[0]))

# ------------------------------------------------------------------

sys.stderr.write("*** check *** jjj ***\n")

# Column index -> prediction function.  Later updates win, so a column that
# appeared in several lists would resolve to sigmoid, then linear, then
# exponent.
predictors = {}
predictors.update(dict.fromkeys(as_exponent, new_epx))
predictors.update(dict.fromkeys(as_linear, new_linear))
predictors.update(dict.fromkeys(as_sigmoid, new_sigmoid))

for it in range(np_train.shape[1]):
    function = predictors.get(it)
    if function is None:
        # Without this check an unlisted column would silently reuse the
        # function chosen for the previous column.
        raise ValueError(
            "column %d is not listed in as_exponent, as_linear or as_sigmoid" % it)
    ConfirmedCases_test[it] = function(test_days, *coefs[it])

sys.stderr.write("*** check *** kkk ***\n")

# The flattened predictions must provide exactly one value per test row.
assert ConfirmedCases_test.size == test_data.shape[0]

# ------------------------------------------------------------------
# Flatten the (key, day) prediction matrix row by row and store it.
flat_cases = ConfirmedCases_test.flatten()
test_data['predict'] = flat_cases

# Fatalities are the predicted cases scaled by the per-key fatality ratio.
submission['ConfirmedCases'] = flat_cases
submission['Fatalities'] = flat_cases * mask_mesh
submission.to_csv('submission.csv', index=False)

sys.stderr.write("*** end ***\n")
# ------------------------------------------------------------------

実行時に次の CSV ファイルが必要です。これらは、コンペのページからダウンロードできます。

$ ls ../input/covid19-global-forecasting-week-3/
submission.csv	test.csv  train.csv

ノートブックに変換する方法です。

ipynb-py-convert  covid19-global-forecasting.py covid19_apr06.ipynb

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up