本記事について
学習済みの重み付きのモデルを
PyTorchを用いて、 onnx 化し、
onnxのモデルを、TensorRT 7 で最適化を行うサンプルとなっています。
TensorRT とは
TensorRT は NVIDIAが提供するDeep Learning model の Runtime で、NVIDIA GPU上で高速に推論が可能です。
他のDeep Learning Frameworkで学習済みモデルを読み込んで推論が可能です。
Throughput の向上と レイテンシの削減
今回のサンプル
- PyTorch でモデルを作成
- onnx に変換
- TensorRTにonnxを読み込み
- 最適化
- GPU上での推論
環境について
環境依存で進めなくなる可能性が高いので、 nvidia が公開している pytorch の container Image を利用します。
Docker 上でGPUを使えるようにする
NVIDIA Container Toolkit を利用して コンテナ起動時に --gpus
オプションを付けることで、GPUを利用可能にします。
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt update && sudo apt install -y nvidia-container-toolkit
sudo systemctl restart docker
# インストールの確認
nvidia-container-cli info
# 動作確認
docker run --gpus all ubuntu nvidia-smi
pytorch の container Image を pull
docker pull nvcr.io/nvidia/pytorch:20.07-py3
docker run --gpus all -it nvcr.io/nvidia/pytorch:20.07-py3
PyTorch のモデルを onnx に変換
onnx のインストール
まずは、onnx のインストールを行います。
apt-get install libprotobuf-dev protobuf-compiler # protobuf がインストールに必要
git clone --recursive https://github.com/onnx/onnx.git
cd onnx
mkdir build && cd build
cmake ..
make -j $(nproc)
make install
cd ../..
アップデート
apt-get update
apt-get upgrade
TensorRT C++ のコンパイル
git clone https://github.com/parallel-forall/code-samples.git
cd code-samples/posts/TensorRT-introduction
make clean && make # Compile the TensorRT C++ code
cd ..
onnxモデルの作成
wget https://developer.download.nvidia.com/devblogs/speeding-up-unet.7z
apt-get install p7zip-full
7z x speeding-up-unet.7z
cd unet
python create_network.py
import torch
from torch.autograd import Variable
import torch.onnx as torch_onnx
import onnx
def main():
    """Export the pretrained brain-segmentation U-Net to ``unet.onnx``.

    The model is fetched through ``torch.hub`` with ``pretrained=True``,
    switched to inference mode and exported with a random
    ``1 x 3 x 256 x 256`` example input. Axis 0 of both the input and the
    output is declared dynamic under the name ``batch``.
    """
    input_shape = (3, 256, 256)
    model_onnx_path = "unet.onnx"
    # A plain tensor is sufficient as the example input; wrapping it in the
    # deprecated ``Variable`` adds nothing.
    dummy_input = torch.randn(1, *input_shape)
    model = torch.hub.load(
        'mateuszbuda/brain-segmentation-pytorch', 'unet',
        in_channels=3, out_channels=1, init_features=32, pretrained=True,
    )
    # Inference mode (equivalent to ``model.train(False)``).
    model.eval()
    # Tensor names written into the ONNX graph.
    inputs = ['input.1']
    outputs = ['186']
    dynamic_axes = {'input.1': {0: 'batch'}, '186': {0: 'batch'}}
    torch.onnx.export(
        model,
        dummy_input,
        model_onnx_path,
        input_names=inputs,
        output_names=outputs,
        dynamic_axes=dynamic_axes,
    )


if __name__ == '__main__':
    main()
Kaggle API経由でのテスト用データセットのダウンロード
KeyをKaggleからダウンロード
https://www.kaggle.com/your_username/account
内の Create API Token を押して、json ファイルをダウンロード
好きな場所に配置
~/.kaggle/kaggle.json
pip install kaggle
export KAGGLE_USERNAME=username
export KAGGLE_KEY=<~/.kaggle/kaggle.jsonに記載されている KEY>
kaggle datasets list # confirm that a list of datasets is printed
# Download the dataset archive (lgg-mri-segmentation.zip) into the current directory
kaggle datasets download -d mateuszbuda/lgg-mri-segmentation
unzip lgg-mri-segmentation.zip
データセットの準備
pip install medpy
vi utils.py
以下の utils.pyを同じフォルダ内に作成
import numpy as np
from medpy.filter.binary import largest_connected_component
from skimage.exposure import rescale_intensity
from skimage.transform import resize
def dsc(y_pred, y_true, lcc=True):
    """Return the Dice similarity coefficient of a prediction and a mask.

    Args:
        y_pred: Predicted mask array.
        y_true: Reference mask array; elements equal to 1 count as foreground.
        lcc: When true and the prediction has any non-zero element, both
            arrays are rounded to integers and the prediction is reduced to
            its largest connected component before scoring.

    Returns:
        ``2 * sum(y_pred where y_true == 1) / (sum(y_pred) + sum(y_true))``.
    """
    needs_cleanup = lcc and np.any(y_pred)
    if needs_cleanup:
        y_pred = np.round(y_pred).astype(int)
        y_true = np.round(y_true).astype(int)
        y_pred = largest_connected_component(y_pred)
    overlap = np.sum(y_pred[y_true == 1])
    total = np.sum(y_pred) + np.sum(y_true)
    return overlap * 2.0 / total
def crop_sample(x):
    """Crop a ``(volume, mask)`` pair to the bounding box of the bright voxels.

    Values below 10% of the volume maximum are first set to 0. This write
    happens in place on the array that was passed in. The bounding box is
    then taken along the first three axes from the non-zero entries of the
    thresholded volume, and the same box is applied to the mask.

    Args:
        x: Tuple ``(volume, mask)``. The indexing below treats ``volume`` as
            having one more trailing axis than ``mask``.

    Returns:
        Tuple ``(cropped_volume, cropped_mask)``.
    """
    volume, mask = x
    cutoff = np.max(volume) * 0.1
    volume[volume < cutoff] = 0

    def _extent(projection):
        # Half-open [start, stop) range covering every non-zero entry.
        hits = np.nonzero(projection)
        return np.min(hits), np.max(hits) + 1

    z_min, z_max = _extent(volume.max(axis=-1).max(axis=-1).max(axis=-1))
    y_min, y_max = _extent(volume.max(axis=0).max(axis=-1).max(axis=-1))
    x_min, x_max = _extent(volume.max(axis=0).max(axis=0).max(axis=-1))

    box = (slice(z_min, z_max), slice(y_min, y_max), slice(x_min, x_max))
    return volume[box], mask[box]
def pad_sample(x):
    """Zero-pad a ``(volume, mask)`` pair so that axes 1 and 2 are equal.

    The shorter of the two axes is padded with zeros on both sides. When the
    difference is odd, the extra element goes on the trailing side. The
    volume receives the same padding as the mask plus an untouched trailing
    axis. Inputs that are already square are returned unchanged.

    Args:
        x: Tuple ``(volume, mask)``.

    Returns:
        Tuple ``(padded_volume, padded_mask)``.
    """
    volume, mask = x
    rows, cols = volume.shape[1], volume.shape[2]
    if rows == cols:
        return volume, mask

    gap = abs(rows - cols)
    before = gap // 2
    after = gap - before
    if rows > cols:
        mask_padding = ((0, 0), (0, 0), (before, after))
    else:
        mask_padding = ((0, 0), (before, after), (0, 0))
    volume_padding = mask_padding + ((0, 0),)

    padded_mask = np.pad(mask, mask_padding, mode="constant", constant_values=0)
    padded_volume = np.pad(volume, volume_padding, mode="constant", constant_values=0)
    return padded_volume, padded_mask
def resize_sample(x, size=256):
    """Resize axes 1 and 2 of a ``(volume, mask)`` pair to ``size x size``.

    The first axis, and for the volume its fourth axis, keep their lengths.
    The mask is resized with interpolation order 0 and the volume with
    order 2. Both use constant padding with value 0 and no anti-aliasing.

    Args:
        x: Tuple ``(volume, mask)``; ``volume`` must have at least four axes.
        size: Target length of axes 1 and 2.

    Returns:
        Tuple ``(resized_volume, resized_mask)``.
    """
    volume, mask = x
    source_shape = volume.shape
    # Options shared by both resize calls.
    shared = dict(mode="constant", cval=0, anti_aliasing=False)

    mask_shape = (source_shape[0], size, size)
    mask = resize(mask, output_shape=mask_shape, order=0, **shared)

    volume_shape = mask_shape + (source_shape[3],)
    volume = resize(volume, output_shape=volume_shape, order=2, **shared)
    return volume, mask
def normalize_volume(volume):
    """Rescale intensities and standardise over the first three axes.

    Intensities are rescaled with the 10th and 99th percentiles of the whole
    volume as the input range. The mean and standard deviation are then
    taken over axes ``(0, 1, 2)`` and used to centre and scale the result.

    Args:
        volume: Array with at least three axes.

    Returns:
        The normalised array.
    """
    low = np.percentile(volume, 10)
    high = np.percentile(volume, 99)
    stretched = rescale_intensity(volume, in_range=(low, high))

    reduce_axes = (0, 1, 2)
    mean = np.mean(stretched, axis=reduce_axes)
    std = np.std(stretched, axis=reduce_axes)
    return (stretched - mean) / std
def log_images(x, y_true, y_pred, channel=1):
    """Build one annotated image per batch element.

    For each element, the selected channel of ``x`` is converted with
    ``gray2rgb``. The prediction is then outlined with ``[255, 0, 0]`` and
    the reference mask with ``[0, 255, 0]``. The reference outline is drawn
    last, so it wins where the two coincide.

    Args:
        x: Batch of inputs; must support ``x[:, channel].cpu().numpy()``
            (presumably a torch tensor - confirm against the caller).
        y_true: Batch of reference masks, indexed as ``[:, 0]``.
        y_pred: Batch of predicted masks, indexed as ``[:, 0]``.
        channel: Channel of ``x`` used as the background image.

    Returns:
        List of arrays as returned by ``gray2rgb`` and ``outline``.
    """
    x_np = x[:, channel].cpu().numpy()
    y_true_np = y_true[:, 0].cpu().numpy()
    y_pred_np = y_pred[:, 0].cpu().numpy()

    images = []
    for idx, frame in enumerate(x_np):
        canvas = gray2rgb(np.squeeze(frame))
        canvas = outline(canvas, y_pred_np[idx], color=[255, 0, 0])
        canvas = outline(canvas, y_true_np[idx], color=[0, 255, 0])
        images.append(canvas)
    return images
def gray2rgb(image):
    """Convert a 2-D array into a 3-channel ``uint8`` image.

    The values are shifted up by the absolute value of the minimum, divided
    by the resulting maximum (when it is positive), multiplied by 255 and
    written identically into all three channels.

    Unlike the previous in-place implementation, the input array is left
    untouched and integer inputs are accepted.

    Args:
        image: 2-D numeric array.

    Returns:
        Array of shape ``image.shape + (3,)`` and dtype ``uint8``.
    """
    w, h = image.shape
    # Work on new arrays: the caller's data (possibly a view into a larger
    # batch) must not be modified as a side effect.
    shifted = image + np.abs(np.min(image))
    peak = np.abs(np.max(shifted))
    if peak > 0:
        # Out-of-place true division also works for integer inputs.
        shifted = shifted / peak
    ret = np.empty((w, h, 3), dtype=np.uint8)
    ret[:, :, 2] = ret[:, :, 1] = ret[:, :, 0] = shifted * 255
    return ret
def outline(image, mask, color):
    """Paint the boundary pixels of ``mask`` onto ``image`` in place.

    The mask is rounded first. A non-zero mask pixel is treated as boundary
    when the mean of its (edge-clipped) 3x3 neighbourhood lies strictly
    between 0 and 1.

    Args:
        image: Array to draw on; it is modified and also returned.
        mask: 2-D mask array.
        color: Value assigned to each boundary pixel of ``image``.

    Returns:
        The same ``image`` object that was passed in.
    """
    binary = np.round(mask)
    rows, cols = np.nonzero(binary)
    for r, c in zip(rows, cols):
        top = max(0, r - 1)
        left = max(0, c - 1)
        coverage = np.mean(binary[top : r + 2, left : c + 2])
        if 0.0 < coverage < 1.0:
            image[r, c] = color
    return image
3枚テスト用画像を作成
input_image には任意の画像を指定
mkdir test_data_set_0
mkdir test_data_set_1
mkdir test_data_set_2
python prepareData.py --input_image lgg-mri-segmentation/kaggle_3m/TCGA_DU_5849_19950405/TCGA_DU_5849_19950405_23.tif --input_tensor test_data_set_0/input_0.pb --output_tensor test_data_set_0/output_0.pb
python prepareData.py --input_image lgg-mri-segmentation/kaggle_3m/TCGA_DU_5849_19950405/TCGA_DU_5849_19950405_24.tif --input_tensor test_data_set_1/input_0.pb --output_tensor test_data_set_1/output_0.pb
python prepareData.py --input_image lgg-mri-segmentation/kaggle_3m/TCGA_DU_5849_19950405/TCGA_DU_5849_19950405_25.tif --input_tensor test_data_set_2/input_0.pb --output_tensor test_data_set_2/output_0.pb # This creates input_0.pb and output_0.pb
最適化 / 推論を実行
cd /workspace/code-samples/posts/TensorRT-introduction
# The unet/ directory was extracted one level up (in posts/), next to
# TensorRT-introduction, so it has to be referenced through ../
./simpleOnnx_1 ../unet/unet.onnx ../unet/test_data_set_0/input_0.pb
# include <NvInfer.h>
# include "cudaWrapper.h"
# include "ioHelper.h"
# include <NvOnnxParser.h>
# include <algorithm>
# include <cassert>
# include <iostream>
# include <memory>
# include <string>
# include <vector>
# include <numeric>
# include <math.h>
# include <cmath>
using namespace nvinfer1;
using namespace std;
using namespace cudawrapper;
static Logger gLogger;
// Maximum absolute tolerance for output tensor comparison against reference.
constexpr double ABS_EPSILON = 0.005;
// Maximum relative tolerance for output tensor comparison against reference.
constexpr double REL_EPSILON = 0.05;
// Parses the ONNX model at onnxModelPath and builds a TensorRT engine for it.
//
// The network is created with the explicit-batch flag, and a single
// optimization profile is registered for input 0: min/opt shape 1x3x256x256,
// max shape 32x3x256x256. The builder workspace is limited to 1 << 30 bytes.
//
// Returns the built engine (the caller takes ownership), or nullptr when the
// ONNX file cannot be parsed.
nvinfer1::ICudaEngine* createCudaEngine(string const& onnxModelPath, int batchSize)
{
    // Destroy<> is declared outside this file section; presumably it releases
    // the TensorRT object when the unique_ptr goes out of scope.
    using BuilderPtr = unique_ptr<nvinfer1::IBuilder, Destroy<nvinfer1::IBuilder>>;
    using NetworkPtr = unique_ptr<nvinfer1::INetworkDefinition, Destroy<nvinfer1::INetworkDefinition>>;
    using ParserPtr = unique_ptr<nvonnxparser::IParser, Destroy<nvonnxparser::IParser>>;
    using ConfigPtr = unique_ptr<nvinfer1::IBuilderConfig, Destroy<nvinfer1::IBuilderConfig>>;

    const auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);

    BuilderPtr builder{nvinfer1::createInferBuilder(gLogger)};
    NetworkPtr network{builder->createNetworkV2(explicitBatch)};
    ParserPtr parser{nvonnxparser::createParser(*network, gLogger)};
    ConfigPtr config{builder->createBuilderConfig()};

    const bool parsed = parser->parseFromFile(onnxModelPath.c_str(), static_cast<int>(ILogger::Severity::kINFO));
    if (!parsed)
    {
        cout << "ERROR: could not parse input engine." << endl;
        return nullptr;
    }

    builder->setMaxBatchSize(batchSize);
    config->setMaxWorkspaceSize((1 << 30));

    // One profile describing the allowed range of the dynamic batch dimension.
    auto profile = builder->createOptimizationProfile();
    const char* inputName = network->getInput(0)->getName();
    profile->setDimensions(inputName, OptProfileSelector::kMIN, Dims4{1, 3, 256, 256});
    profile->setDimensions(inputName, OptProfileSelector::kOPT, Dims4{1, 3, 256, 256});
    profile->setDimensions(inputName, OptProfileSelector::kMAX, Dims4{32, 3, 256, 256});
    config->addOptimizationProfile(profile);

    return builder->buildEngineWithConfig(*network, *config);
}
// Returns the index of the input binding: 0 when binding 0 is an input,
// otherwise 1. Only meaningful for engines with exactly one input and one
// output binding.
static int getBindingInputIndex(nvinfer1::IExecutionContext* context)
{
    const bool firstIsInput = context->getEngine().bindingIsInput(0);
    return firstIsInput ? 0 : 1;
}
// Queues one inference on `stream`: host-to-device copy of inputTensor,
// execution of the context, then device-to-host copy into outputTensor.
//
// All three operations are issued asynchronously, so the caller is expected
// to synchronize the stream before reading outputTensor. `bindings` must
// hold the two device buffers (input and output). `batchSize` is currently
// unused.
void launchInference(IExecutionContext* context, cudaStream_t stream, vector<float> const& inputTensor, vector<float>& outputTensor, void** bindings, int batchSize)
{
    const int inputSlot = getBindingInputIndex(context);
    const int outputSlot = 1 - inputSlot;
    const size_t inputBytes = inputTensor.size() * sizeof(float);
    const size_t outputBytes = outputTensor.size() * sizeof(float);

    cudaMemcpyAsync(bindings[inputSlot], inputTensor.data(), inputBytes, cudaMemcpyHostToDevice, stream);
    context->enqueueV2(bindings, stream, nullptr);
    cudaMemcpyAsync(outputTensor.data(), bindings[outputSlot], outputBytes, cudaMemcpyDeviceToHost, stream);
}
// Compares the first `size` elements of outputTensor against referenceTensor.
//
// An element matches when its absolute difference from the reference is at
// most max(|reference| * REL_EPSILON, ABS_EPSILON). Prints "OK" when every
// element matches; otherwise prints the first mismatch and stops. A `size`
// that is negative or larger than either tensor is reported as an error
// instead of reading out of bounds.
void verifyOutput(vector<float> const& outputTensor, vector<float> const& referenceTensor, int size)
{
    // Validate the requested element count before indexing.
    if (size < 0 || static_cast<size_t>(size) > outputTensor.size() || static_cast<size_t>(size) > referenceTensor.size())
    {
        cout << "ERROR: cannot compare " << size << " elements, tensors hold "
             << outputTensor.size() << " and " << referenceTensor.size() << endl;
        return;
    }

    const size_t count = static_cast<size_t>(size);
    for (size_t i = 0; i < count; ++i)
    {
        double reference = static_cast<double>(referenceTensor[i]);
        // Check absolute and relative tolerance.
        if (abs(outputTensor[i] - reference) > max(abs(reference) * REL_EPSILON, ABS_EPSILON))
        {
            cout << "ERROR: mismatch at position " << i;
            cout << " expected " << reference << ", but was " << outputTensor[i] << endl;
            return;
        }
    }
    cout << "OK" << endl;
}
// Writes the first H*W values of outputTensor to "output.pgm" as an ASCII
// (P2) PGM image with maximum gray value 255.
//
// Each value is multiplied by 255, rounded and clamped to [0, 255]. The
// tensor is read row-major, H rows of W values each. Nothing is written when
// the tensor holds fewer than H*W values or the file cannot be opened; an
// error is printed instead.
void saveImageAsPGM(vector<float>& outputTensor,int H, int W)
{
    const size_t needed = (H > 0 && W > 0) ? static_cast<size_t>(H) * static_cast<size_t>(W) : 0;
    if (needed > outputTensor.size())
    {
        cout << "ERROR: output tensor holds " << outputTensor.size()
             << " values, fewer than the " << needed << " needed for output.pgm" << endl;
        return;
    }

    FILE* pgmimg = fopen("output.pgm", "wb");
    if (pgmimg == nullptr)
    {
        cout << "ERROR: could not open output.pgm for writing" << endl;
        return;
    }

    fprintf(pgmimg, "P2\n");
    // Writing Width and Height (the PGM header expects width first).
    fprintf(pgmimg, "%d %d\n", W, H);
    // Writing the maximum gray value
    fprintf(pgmimg, "255\n");
    for (int i = 0; i < H; ++i)
    {
        for (int j = 0; j < W; ++j)
        {
            // Row-major layout: the stride between rows is W, not H.
            const int gray = static_cast<int>(round(255 * outputTensor[static_cast<size_t>(i) * W + j]));
            // Keep the value inside the range announced in the header.
            fprintf(pgmimg, "%d ", min(255, max(0, gray)));
        }
        fprintf(pgmimg, "\n");
    }
    fclose(pgmimg);
}
int main(int argc, char* argv[])
{
// Declaring cuda engine.
unique_ptr<ICudaEngine, Destroy<ICudaEngine>> engine{nullptr};
// Declaring execution context.
unique_ptr<IExecutionContext, Destroy<IExecutionContext>> context{nullptr};
vector<float> inputTensor;
vector<float> outputTensor;
vector<float> referenceTensor;
void* bindings[2]{0};
vector<string> inputFiles;
CudaStream stream;
if (argc != 3)
{
cout << "usage: " << argv[0] << " <path_to_model.onnx> <path_to_input.pb>" << endl;
return 1;
}
string onnxModelPath(argv[1]);
inputFiles.push_back(string{argv[2]});
int batchSize = inputFiles.size();
// Create Cuda Engine.
engine.reset(createCudaEngine(onnxModelPath, batchSize));
if (!engine)
return 1;
// Assume networks takes exactly 1 input tensor and outputs 1 tensor.
assert(engine->getNbBindings() == 2);
assert(engine->bindingIsInput(0) ^ engine->bindingIsInput(1));
for (int i = 0; i < engine->getNbBindings(); ++i)
{
Dims dims{engine->getBindingDimensions(i)};
size_t size = accumulate(dims.d+1, dims.d + dims.nbDims, batchSize, multiplies<size_t>());
// Create CUDA buffer for Tensor.
cudaMalloc(&bindings[i], batchSize * size * sizeof(float));
// Resize CPU buffers to fit Tensor.
if (engine->bindingIsInput(i)){
inputTensor.resize(size);
}
else
outputTensor.resize(size);
}
// Read input tensor from ONNX file.
if (readTensor(inputFiles, inputTensor) != inputTensor.size())
{
cout << "Couldn't read input Tensor" << endl;
return 1;
}
// Create Execution Context.
context.reset(engine->createExecutionContext());
Dims dims_i{engine->getBindingDimensions(0)};
Dims4 inputDims{batchSize, dims_i.d[1], dims_i.d[2], dims_i.d[3]};
context->setBindingDimensions(0, inputDims);
launchInference(context.get(), stream, inputTensor, outputTensor, bindings, batchSize);
Dims dims{engine->getBindingDimensions(1)};
saveImageAsPGM(outputTensor, dims.d[2], dims.d[3]);
// Wait until the work is finished.
cudaStreamSynchronize(stream);
vector<string> referenceFiles;
for (string path : inputFiles)
referenceFiles.push_back(path.replace(path.rfind("input"), 5, "output"));
// Try to read and compare against reference tensor from protobuf file.
referenceTensor.resize(outputTensor.size());
if (readTensor(referenceFiles, referenceTensor) != referenceTensor.size())
{
cout << "Couldn't read reference Tensor" << endl;
return 1;
}
Dims dims_o{engine->getBindingDimensions(1)};
int size = batchSize * dims_o.d[2] * dims_o.d[3];
verifyOutput(outputTensor, referenceTensor, size);
for (void* ptr : bindings)
cudaFree(ptr);
return 0;
}
▼output.pgm
以下のようなログが出力されれば成功
: Conv_17 + Relu_19 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_17 + Relu_19 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_17 + Relu_19 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_17 + Relu_19 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_17 + Relu_19 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_17 + Relu_19 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,64,4096,1048576) -> Float(1,32,1024,131072) ***************
: --------------- Timing Runner: MaxPool_20 (Pooling)
: Tactic: -1 time 0.01536
: Fastest Tactic: -1 Time: 0.01536
: --------------- Timing Runner: MaxPool_20 (TiledPooling)
: Tactic: 5505281 time 0.01024
: Tactic: 5570817 time 0.008192
: Tactic: 5636353 time 0.008192
: Tactic: 5701889 time 0.007168
: Tactic: 5767425 time 0.008192
: Tactic: 5832961 time 0.007168
: Tactic: 5898497 time 0.008192
: Tactic: 5964033 time 0.008192
: Tactic: 6029569 time 0.013312
: Tactic: 6095105 time 0.009216
: Tactic: 6160641 time 0.009216
: Tactic: 6226177 time 0.008192
: Tactic: 6291713 time 0.0072
: Tactic: 6357249 time 0.007168
: Tactic: 6422785 time 0.007168
: Tactic: 6488321 time 0.007168
: Fastest Tactic: 5701889 Time: 0.007168
: >>>>>>>>>>>>>>> Chose Runner Type: TiledPooling Tactic: 5701889
:
: *************** Autotuning format combination: Float(1,32,1024,131072) -> Float(1,32,1024,262144) ***************
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_21 + Relu_23 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_21 + Relu_23 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_21 + Relu_23 (CaskConvolution)
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.26112
: Conv_21 + Relu_23 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.041984
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.166944
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.256032
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.262176
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.151584
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.172064
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.152608
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.184352
: Fastest Tactic: 2775507031594384867 Time: 0.041984
: --------------- Timing Runner: Conv_21 + Relu_23 (CudaConvolution)
: Tactic: 0 time 0.126976
: Tactic: 2 time 0.16384
: Tactic: 5 time 0.992224
: Tactic: 6 time 0.056288
: Tactic: 57 time 0.1024
: Fastest Tactic: 6 Time: 0.056288
: --------------- Timing Runner: Conv_21 + Relu_23 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_21 + Relu_23 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_21 + Relu_23 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_21 + Relu_23 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_21 + Relu_23 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,32,1024,262144) -> Float(1,32,1024,524288) ***************
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_24 + Relu_26 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_24 + Relu_26 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_24 + Relu_26 (CaskConvolution)
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.510976
: Conv_24 + Relu_26 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.073696
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.320512
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.500736
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.510976
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.289792
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.331776
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.295936
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.357344
: Fastest Tactic: 2775507031594384867 Time: 0.073696
: --------------- Timing Runner: Conv_24 + Relu_26 (CudaConvolution)
: Tactic: 0 time 0.234464
: Tactic: 2 time 0.260096
: Tactic: 5 time 1.86368
: Tactic: 6 time 0.093184
: Tactic: 57 time 0.283648
: Fastest Tactic: 6 Time: 0.093184
: --------------- Timing Runner: Conv_24 + Relu_26 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_24 + Relu_26 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_24 + Relu_26 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_24 + Relu_26 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_24 + Relu_26 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,32,1024,524288) -> Float(1,16,256,65536) ***************
: --------------- Timing Runner: MaxPool_27 (Pooling)
: Tactic: -1 time 0.008192
: Fastest Tactic: -1 Time: 0.008192
: --------------- Timing Runner: MaxPool_27 (TiledPooling)
: Tactic: 5505281 time 0.01024
: Tactic: 5570817 time 0.008192
: Tactic: 5636353 time 0.008192
: Tactic: 5701889 time 0.007168
: Tactic: 5767425 time 0.006176
: Tactic: 5832961 time 0.008192
: Tactic: 5898497 time 0.007168
: Tactic: 5964033 time 0.007168
: Tactic: 6029569 time 0.01024
: Tactic: 6095105 time 0.008192
: Tactic: 6160641 time 0.0072
: Tactic: 6226177 time 0.007168
: Tactic: 6291713 time 0.007168
: Tactic: 6357249 time 0.007168
: Tactic: 6422785 time 0.006144
: Tactic: 6488321 time 0.006144
: Fastest Tactic: 6422785 Time: 0.006144
: >>>>>>>>>>>>>>> Chose Runner Type: TiledPooling Tactic: 6422785
:
: *************** Autotuning format combination: Float(1,16,256,65536) -> Float(1,16,256,131072) ***************
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_28 + Relu_30 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_28 + Relu_30 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_28 + Relu_30 (CaskConvolution)
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.515072
: Conv_28 + Relu_30 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.07168
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.332832
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.499744
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.51712
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.288768
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.348192
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.304128
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.354304
: Fastest Tactic: 2775507031594384867 Time: 0.07168
: --------------- Timing Runner: Conv_28 + Relu_30 (CudaConvolution)
: Tactic: 0 time 0.216064
: Tactic: 2 time 0.226304
: Tactic: 5 time 1.53699
: Tactic: 6 time 0.097312
: Tactic: 57 time 0.186368
: Fastest Tactic: 6 Time: 0.097312
: --------------- Timing Runner: Conv_28 + Relu_30 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_28 + Relu_30 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_28 + Relu_30 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_28 + Relu_30 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_28 + Relu_30 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,16,256,131072) -> Float(1,16,256,131072) ***************
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_31 + Relu_33 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_31 + Relu_33 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_31 + Relu_33 (CaskConvolution)
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 1.01581
: Conv_31 + Relu_33 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.135136
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.710656
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.98816
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 1.02093
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.613376
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.728032
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.713728
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.80384
: Fastest Tactic: 2775507031594384867 Time: 0.135136
: --------------- Timing Runner: Conv_31 + Relu_33 (CudaConvolution)
: Tactic: 0 time 0.415744
: Tactic: 2 time 0.398336
: Tactic: 5 skipped. Scratch requested: 1283457024, available: 1073741824
: Tactic: 6 time 0.178176
: Tactic: 57 time 0.425984
: Fastest Tactic: 6 Time: 0.178176
: --------------- Timing Runner: Conv_31 + Relu_33 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_31 + Relu_33 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_31 + Relu_33 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_31 + Relu_33 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_31 + Relu_33 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,16,256,131072) -> Float(1,32,1024,262144) ***************
: --------------- Timing Runner: ConvTranspose_34 (CudnnDeconvolution)
: Tactic: 0 time 0.09728
: Tactic: 1 time 0.192512
: Tactic: 3 time 3.56864
: Fastest Tactic: 0 Time: 0.09728
: --------------- Timing Runner: ConvTranspose_34 (CaskDeconvolution)
: CaskDeconvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: ConvTranspose_34 (GemmDeconvolution)
: Tactic: 0 time 0.050176
: Fastest Tactic: 0 Time: 0.050176
: >>>>>>>>>>>>>>> Chose Runner Type: GemmDeconvolution Tactic: 0
:
: --------------- Timing Runner: 153 copy (Reformat)
: Tactic: 1002 time 0.009216
: Tactic: 0 time 0.007168
: Fastest Tactic: 0 Time: 0.007168
: *************** Autotuning format combination: Float(1,32,1024,524288) -> Float(1,32,1024,262144) ***************
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_36 + Relu_38 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_36 + Relu_38 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_36 + Relu_38 (CaskConvolution)
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 1.01171
: Conv_36 + Relu_38 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.13616
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.672768
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.990208
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 1.00758
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.608256
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.698368
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.68608
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.796672
: Fastest Tactic: 2775507031594384867 Time: 0.13616
: --------------- Timing Runner: Conv_36 + Relu_38 (CudaConvolution)
: Tactic: 0 time 0.446496
: Tactic: 2 time 0.443392
: Tactic: 5 time 3.62394
: Tactic: 6 time 0.16384
: Tactic: 57 time 0.32256
: Fastest Tactic: 6 Time: 0.16384
: --------------- Timing Runner: Conv_36 + Relu_38 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_36 + Relu_38 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_36 + Relu_38 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_36 + Relu_38 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_36 + Relu_38 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,32,1024,262144) -> Float(1,32,1024,262144) ***************
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_39 + Relu_41 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_39 + Relu_41 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_39 + Relu_41 (CaskConvolution)
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.509952
: Conv_39 + Relu_41 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.072704
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.321536
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.50176
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.510976
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.290816
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.333824
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.295936
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.356384
: Fastest Tactic: 2775507031594384867 Time: 0.072704
: --------------- Timing Runner: Conv_39 + Relu_41 (CudaConvolution)
: Tactic: 0 time 0.233472
: Tactic: 2 time 0.26112
: Tactic: 5 time 1.86781
: Tactic: 6 time 0.093184
: Tactic: 57 time 0.172032
: Fastest Tactic: 6 Time: 0.093184
: --------------- Timing Runner: Conv_39 + Relu_41 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_39 + Relu_41 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_39 + Relu_41 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_39 + Relu_41 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_39 + Relu_41 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,32,1024,262144) -> Float(1,64,4096,524288) ***************
: --------------- Timing Runner: ConvTranspose_42 (CudnnDeconvolution)
: Tactic: 0 time 0.073728
: Tactic: 1 time 0.400352
: Tactic: 3 time 1.9415
: Fastest Tactic: 0 Time: 0.073728
: --------------- Timing Runner: ConvTranspose_42 (CaskDeconvolution)
: CaskDeconvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: ConvTranspose_42 (GemmDeconvolution)
: Tactic: 0 time 0.055264
: Fastest Tactic: 0 Time: 0.055264
: >>>>>>>>>>>>>>> Chose Runner Type: GemmDeconvolution Tactic: 0
:
: --------------- Timing Runner: 161 copy (Reformat)
: Tactic: 1002 time 0.01024
: Tactic: 0 time 0.008192
: Fastest Tactic: 0 Time: 0.008192
: *************** Autotuning format combination: Float(1,64,4096,1048576) -> Float(1,64,4096,524288) ***************
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_44 + Relu_46 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_44 + Relu_46 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_44 + Relu_46 (CaskConvolution)
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.508896
: Conv_44 + Relu_46 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.13312
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.326656
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.502784
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.50896
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.301056
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.32976
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.342016
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.533504
: Fastest Tactic: 2775507031594384867 Time: 0.13312
: --------------- Timing Runner: Conv_44 + Relu_46 (CudaConvolution)
: Tactic: 0 time 0.4096
: Tactic: 2 skipped. Scratch requested: 1207959552, available: 1073741824
: Tactic: 5 time 1.95786
: Tactic: 6 time 0.152576
: Tactic: 57 time 0.304096
: Fastest Tactic: 6 Time: 0.152576
: --------------- Timing Runner: Conv_44 + Relu_46 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_44 + Relu_46 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_44 + Relu_46 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_44 + Relu_46 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_44 + Relu_46 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,64,4096,524288) -> Float(1,64,4096,524288) ***************
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_47 + Relu_49 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_47 + Relu_49 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_47 + Relu_49 (CaskConvolution)
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.26112
: Conv_47 + Relu_49 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.073728
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.16384
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.25904
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.26112
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.1536
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.166912
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.172032
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.274432
: Fastest Tactic: 2775507031594384867 Time: 0.073728
: --------------- Timing Runner: Conv_47 + Relu_49 (CudaConvolution)
: Tactic: 0 time 0.218112
: Tactic: 2 time 0.260096
: Tactic: 5 time 1.09469
: Tactic: 6 time 0.089088
: Tactic: 57 time 0.170976
: Fastest Tactic: 6 Time: 0.089088
: --------------- Timing Runner: Conv_47 + Relu_49 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_47 + Relu_49 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_47 + Relu_49 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_47 + Relu_49 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_47 + Relu_49 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,64,4096,524288) -> Float(1,128,16384,1048576) ***************
: --------------- Timing Runner: ConvTranspose_50 (CudnnDeconvolution)
: Tactic: 0 time 0.08496
: Tactic: 1 time 0.415744
: Tactic: 3 time 1.61894
: Fastest Tactic: 0 Time: 0.08496
: --------------- Timing Runner: ConvTranspose_50 (CaskDeconvolution)
: CaskDeconvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: ConvTranspose_50 (GemmDeconvolution)
: Tactic: 0 time 0.068576
: Fastest Tactic: 0 Time: 0.068576
: >>>>>>>>>>>>>>> Chose Runner Type: GemmDeconvolution Tactic: 0
:
: --------------- Timing Runner: 169 copy (Reformat)
: Tactic: 1002 time 0.027616
: Tactic: 0 time 0.016384
: Fastest Tactic: 0 Time: 0.016384
: *************** Autotuning format combination: Float(1,128,16384,2097152) -> Float(1,128,16384,1048576) ***************
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_52 + Relu_54 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_52 + Relu_54 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_52 + Relu_54 (CaskConvolution)
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.502784
: Conv_52 + Relu_54 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.141312
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.26624
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.500736
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.512
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.266272
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.271392
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.267296
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.306176
: Fastest Tactic: 2775507031594384867 Time: 0.141312
: --------------- Timing Runner: Conv_52 + Relu_54 (CudaConvolution)
: Tactic: 0 time 0.367584
: Tactic: 2 skipped. Scratch requested: 2415919104, available: 1073741824
: Tactic: 5 time 1.65168
: Tactic: 6 time 0.163808
: Tactic: 57 time 0.288768
: Fastest Tactic: 6 Time: 0.163808
: --------------- Timing Runner: Conv_52 + Relu_54 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_52 + Relu_54 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_52 + Relu_54 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_52 + Relu_54 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_52 + Relu_54 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,128,16384,1048576) -> Float(1,128,16384,1048576) ***************
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_55 + Relu_57 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_55 + Relu_57 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_55 + Relu_57 (CaskConvolution)
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.260064
: Conv_55 + Relu_57 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.077856
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.139296
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.257056
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.266272
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.141344
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.144384
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.144384
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.160768
: Fastest Tactic: 2775507031594384867 Time: 0.077856
: --------------- Timing Runner: Conv_55 + Relu_57 (CudaConvolution)
: Tactic: 0 time 0.200672
: Tactic: 2 skipped. Scratch requested: 1207959552, available: 1073741824
: Tactic: 5 time 1.07622
: Tactic: 6 time 0.102368
: Tactic: 57 time 0.162816
: Fastest Tactic: 6 Time: 0.102368
: --------------- Timing Runner: Conv_55 + Relu_57 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_55 + Relu_57 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_55 + Relu_57 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_55 + Relu_57 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_55 + Relu_57 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,128,16384,1048576) -> Float(1,256,65536,2097152) ***************
: --------------- Timing Runner: ConvTranspose_58 (CudnnDeconvolution)
: Tactic: 0 time 0.137216
: Tactic: 1 time 0.372736
: Tactic: 3 time 2.36646
: Fastest Tactic: 0 Time: 0.137216
: --------------- Timing Runner: ConvTranspose_58 (CaskDeconvolution)
: CaskDeconvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: ConvTranspose_58 (GemmDeconvolution)
: Tactic: 0 time 0.08496
: Fastest Tactic: 0 Time: 0.08496
: >>>>>>>>>>>>>>> Chose Runner Type: GemmDeconvolution Tactic: 0
:
: --------------- Timing Runner: 177 copy (Reformat)
: Tactic: 1002 time 0.048128
: Tactic: 0 time 0.033792
: Fastest Tactic: 0 Time: 0.033792
: *************** Autotuning format combination: Float(1,256,65536,4194304) -> Float(1,256,65536,2097152) ***************
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_60 + Relu_62 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_60 + Relu_62 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_60 + Relu_62 (CaskConvolution)
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.872448
: Conv_60 + Relu_62 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.13824
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.462848
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.86736
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.910368
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.452608
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.459776
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.252928
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.285728
: Fastest Tactic: 2775507031594384867 Time: 0.13824
: --------------- Timing Runner: Conv_60 + Relu_62 (CudaConvolution)
: Tactic: 0 time 0.395232
: Tactic: 2 skipped. Scratch requested: 4831838208, available: 1073741824
: Tactic: 5 time 2.41766
: Tactic: 6 time 0.187392
: Tactic: 57 time 0.316416
: Fastest Tactic: 6 Time: 0.187392
: --------------- Timing Runner: Conv_60 + Relu_62 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_60 + Relu_62 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_60 + Relu_62 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_60 + Relu_62 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_60 + Relu_62 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,256,65536,2097152) -> Float(1,256,65536,2097152) ***************
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_63 + Relu_65 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_63 + Relu_65 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_63 + Relu_65 (CaskConvolution)
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.422912
: Conv_63 + Relu_65 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Tactic: 2775507031594384867 time 0.078848
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Tactic: 2842488832350522458 time 0.224256
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.420864
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Tactic: 6448355332020552203 time 0.454656
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.22528
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.228352
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.126976
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.151552
: Fastest Tactic: 2775507031594384867 Time: 0.078848
: --------------- Timing Runner: Conv_63 + Relu_65 (CudaConvolution)
: Tactic: 0 time 0.218112
: Tactic: 2 skipped. Scratch requested: 2415919104, available: 1073741824
: Tactic: 5 time 1.68448
: Tactic: 6 time 0.129024
: Tactic: 57 time 0.185344
: Fastest Tactic: 6 Time: 0.129024
: --------------- Timing Runner: Conv_63 + Relu_65 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2775507031594384867
: Conv_63 + Relu_65 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
:
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_63 + Relu_65 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_xregs_large_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_xregs_large_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_63 + Relu_65 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_63 + Relu_65 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: *************** Autotuning format combination: Float(1,256,65536,2097152) -> Float(1,256,65536,65536) ***************
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_interior_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_interior_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_interior_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: --------------- Timing Runner: Conv_66 (FusedConvActConvolution)
: FusedConvActConvolution has no valid tactics for this config, skipping
: --------------- Timing Runner: Conv_66 (CaskConvolution)
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_interior_nn_v1
: Tactic: 1754569683116234317 time 0.072704
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Tactic: 1825138533642645384 time 0.072704
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_interior_nn_v1
: Tactic: 2733356012094739613 time 0.029696
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Tactic: 3915320020053085238 time 0.070656
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Tactic: 6808617066150061604 time 0.043008
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_interior_nn_v1
: Tactic: 9091006216302412844 time 0.043008
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Tactic: -8060443123034038864 time 0.043008
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Tactic: -4420849921117327522 time 0.033792
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Tactic: -3946921629105938337 time 0.03072
: Fastest Tactic: 2733356012094739613 Time: 0.029696
: --------------- Timing Runner: Conv_66 (CudaConvolution)
: Tactic: 0 time 0.033792
: Tactic: 2 time 0.069632
: Tactic: 5 time 0.091136
: Tactic: 57 time 0.044032
: Fastest Tactic: 0 Time: 0.033792
: --------------- Timing Runner: Conv_66 (CudaDepthwiseConvolution)
: CudaDepthwiseConvolution has no valid tactics for this config, skipping
: >>>>>>>>>>>>>>> Chose Runner Type: CaskConvolution Tactic: 2733356012094739613
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_interior_nn_v1
:
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_interior_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_medium_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_interior_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x128_relu_small_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_small_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_interior_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x64_relu_medium_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_medium_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_interior_nn_v1
: *************** Autotuning format combination: Float(1,256,65536,65536) -> Float(1,256,65536,65536) ***************
: --------------- Timing Runner: Sigmoid_67 (Activation)
: Tactic: 0 is the only option, timing skipped
: Fastest Tactic: 0 Time: 0
: Formats and tactics selection completed in 3.28856 seconds.
: After reformat layers: 32 layers
: Block size 1073741824
: Block size 536870912
: Block size 268435456
: Block size 134217728
: Block size 134217728
: Block size 67108864
: Block size 33554432
: Total Activation Memory: 2248146944
INFO: Detected 1 inputs and 1 output network tensors.
: Conv_0 + Relu_2 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_small_nn_v1
: Conv_3 + Relu_5 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_7 + Relu_9 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_10 + Relu_12 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_14 + Relu_16 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_17 + Relu_19 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_21 + Relu_23 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_24 + Relu_26 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_28 + Relu_30 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_31 + Relu_33 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_36 + Relu_38 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_39 + Relu_41 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_44 + Relu_46 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_47 + Relu_49 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_52 + Relu_54 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_55 + Relu_57 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_60 + Relu_62 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_63 + Relu_65 (scudnn_winograd) Set Tactic Name: volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
: Conv_66 (scudnn) Set Tactic Name: volta_scudnn_128x32_relu_interior_nn_v1
: Layer: Conv_0 + Relu_2 Weights: 0 HostPersistent: 1664 DevicePersistent: 397312
: Layer: Conv_3 + Relu_5 Weights: 0 HostPersistent: 512 DevicePersistent: 102912
: Layer: MaxPool_6 Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_7 + Relu_9 Weights: 0 HostPersistent: 512 DevicePersistent: 205312
: Layer: Conv_10 + Relu_12 Weights: 0 HostPersistent: 512 DevicePersistent: 410112
: Layer: MaxPool_13 Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_14 + Relu_16 Weights: 0 HostPersistent: 512 DevicePersistent: 819712
: Layer: Conv_17 + Relu_19 Weights: 0 HostPersistent: 512 DevicePersistent: 1638912
: Layer: MaxPool_20 Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_21 + Relu_23 Weights: 0 HostPersistent: 512 DevicePersistent: 3277824
: Layer: Conv_24 + Relu_26 Weights: 0 HostPersistent: 512 DevicePersistent: 6554624
: Layer: MaxPool_27 Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_28 + Relu_30 Weights: 0 HostPersistent: 512 DevicePersistent: 13109248
: Layer: Conv_31 + Relu_33 Weights: 0 HostPersistent: 512 DevicePersistent: 26216448
: Layer: ConvTranspose_34 Weights: 2097152 HostPersistent: 0 DevicePersistent: 0
: Layer: 153 copy Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_36 + Relu_38 Weights: 0 HostPersistent: 512 DevicePersistent: 13108224
: Layer: Conv_39 + Relu_41 Weights: 0 HostPersistent: 512 DevicePersistent: 6554624
: Layer: ConvTranspose_42 Weights: 524288 HostPersistent: 0 DevicePersistent: 0
: Layer: 161 copy Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_44 + Relu_46 Weights: 0 HostPersistent: 512 DevicePersistent: 3277312
: Layer: Conv_47 + Relu_49 Weights: 0 HostPersistent: 512 DevicePersistent: 1638912
: Layer: ConvTranspose_50 Weights: 131072 HostPersistent: 0 DevicePersistent: 0
: Layer: 169 copy Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_52 + Relu_54 Weights: 0 HostPersistent: 512 DevicePersistent: 819712
: Layer: Conv_55 + Relu_57 Weights: 0 HostPersistent: 512 DevicePersistent: 410112
: Layer: ConvTranspose_58 Weights: 32768 HostPersistent: 0 DevicePersistent: 0
: Layer: 177 copy Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Layer: Conv_60 + Relu_62 Weights: 0 HostPersistent: 512 DevicePersistent: 205312
: Layer: Conv_63 + Relu_65 Weights: 0 HostPersistent: 512 DevicePersistent: 102912
: Layer: Conv_66 Weights: 0 HostPersistent: 3200 DevicePersistent: 393728
: Layer: Sigmoid_67 Weights: 0 HostPersistent: 0 DevicePersistent: 0
: Total Host Persistent Memory: 13568
: Total Device Persistent Memory: 79243264
: Total Weight Memory: 2785280
: Builder timing cache: created 19 entries, 0 hit(s)
: Engine generation completed in 4.04331 seconds.
: Engine Layer Information:
: Layer(scudnn): Conv_0 + Relu_2, Tactic: -3946921629105938337, input.1[Float(3,256,256)] -> 121[Float(32,256,256)]
: Layer(scudnn_winograd): Conv_3 + Relu_5, Tactic: 2775507031594384867, 121[Float(32,256,256)] -> 178[Float(32,256,256)]
: Layer(PoolingTiled): MaxPool_6, Tactic: 5898497, 178[Float(32,256,256)] -> 125[Float(32,128,128)]
: Layer(scudnn_winograd): Conv_7 + Relu_9, Tactic: 2775507031594384867, 125[Float(32,128,128)] -> 128[Float(64,128,128)]
: Layer(scudnn_winograd): Conv_10 + Relu_12, Tactic: 2775507031594384867, 128[Float(64,128,128)] -> 170[Float(64,128,128)]
: Layer(PoolingTiled): MaxPool_13, Tactic: 5898497, 170[Float(64,128,128)] -> 132[Float(64,64,64)]
: Layer(scudnn_winograd): Conv_14 + Relu_16, Tactic: 2775507031594384867, 132[Float(64,64,64)] -> 135[Float(128,64,64)]
: Layer(scudnn_winograd): Conv_17 + Relu_19, Tactic: 2775507031594384867, 135[Float(128,64,64)] -> 162[Float(128,64,64)]
: Layer(PoolingTiled): MaxPool_20, Tactic: 5701889, 162[Float(128,64,64)] -> 139[Float(128,32,32)]
: Layer(scudnn_winograd): Conv_21 + Relu_23, Tactic: 2775507031594384867, 139[Float(128,32,32)] -> 142[Float(256,32,32)]
: Layer(scudnn_winograd): Conv_24 + Relu_26, Tactic: 2775507031594384867, 142[Float(256,32,32)] -> 154[Float(256,32,32)]
: Layer(PoolingTiled): MaxPool_27, Tactic: 6422785, 154[Float(256,32,32)] -> 146[Float(256,16,16)]
: Layer(scudnn_winograd): Conv_28 + Relu_30, Tactic: 2775507031594384867, 146[Float(256,16,16)] -> 149[Float(512,16,16)]
: Layer(scudnn_winograd): Conv_31 + Relu_33, Tactic: 2775507031594384867, 149[Float(512,16,16)] -> 152[Float(512,16,16)]
: Layer(gemmDeconvolution): ConvTranspose_34, Tactic: 0, 152[Float(512,16,16)] -> 153[Float(256,32,32)]
: Layer(Reformat): 153 copy, Tactic: 0, 153[Float(256,32,32)] -> 154[Float(256,32,32)]
: Layer(scudnn_winograd): Conv_36 + Relu_38, Tactic: 2775507031594384867, 154[Float(512,32,32)] -> 157[Float(256,32,32)]
: Layer(scudnn_winograd): Conv_39 + Relu_41, Tactic: 2775507031594384867, 157[Float(256,32,32)] -> 160[Float(256,32,32)]
: Layer(gemmDeconvolution): ConvTranspose_42, Tactic: 0, 160[Float(256,32,32)] -> 161[Float(128,64,64)]
: Layer(Reformat): 161 copy, Tactic: 0, 161[Float(128,64,64)] -> 162[Float(128,64,64)]
: Layer(scudnn_winograd): Conv_44 + Relu_46, Tactic: 2775507031594384867, 162[Float(256,64,64)] -> 165[Float(128,64,64)]
: Layer(scudnn_winograd): Conv_47 + Relu_49, Tactic: 2775507031594384867, 165[Float(128,64,64)] -> 168[Float(128,64,64)]
: Layer(gemmDeconvolution): ConvTranspose_50, Tactic: 0, 168[Float(128,64,64)] -> 169[Float(64,128,128)]
: Layer(Reformat): 169 copy, Tactic: 0, 169[Float(64,128,128)] -> 170[Float(64,128,128)]
: Layer(scudnn_winograd): Conv_52 + Relu_54, Tactic: 2775507031594384867, 170[Float(128,128,128)] -> 173[Float(64,128,128)]
: Layer(scudnn_winograd): Conv_55 + Relu_57, Tactic: 2775507031594384867, 173[Float(64,128,128)] -> 176[Float(64,128,128)]
: Layer(gemmDeconvolution): ConvTranspose_58, Tactic: 0, 176[Float(64,128,128)] -> 177[Float(32,256,256)]
: Layer(Reformat): 177 copy, Tactic: 0, 177[Float(32,256,256)] -> 178[Float(32,256,256)]
: Layer(scudnn_winograd): Conv_60 + Relu_62, Tactic: 2775507031594384867, 178[Float(64,256,256)] -> 181[Float(32,256,256)]
: Layer(scudnn_winograd): Conv_63 + Relu_65, Tactic: 2775507031594384867, 181[Float(32,256,256)] -> 184[Float(32,256,256)]
: Layer(scudnn): Conv_66, Tactic: 2733356012094739613, 184[Float(32,256,256)] -> 185[Float(1,256,256)]
: Layer(Activation): Sigmoid_67, Tactic: 0, 185[Float(1,256,256)] -> 186[Float(1,256,256)]
OK