/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
#include <miopen/conv/solvers.hpp>
#include <miopen/env.hpp>
#include <miopen/handle.hpp>
#include <miopen/conv/invokers/impl_gemm_dynamic.hpp>
#include <miopen/generic_search.hpp>
#include <miopen/gcn_asm_utils.hpp>
#include <miopen/solver/implicitgemm_util.hpp>
#include <miopen/conv/asm_implicit_gemm.hpp>
#include <miopen/batched_transpose_sol.hpp>
#include <miopen/buffer_info.hpp>
#include <miopen/solver/problem_description_interpreter.hpp>

// Boolean environment variables declared through MIOPEN_DECLARE_ENV_VAR_BOOL.
// NOTE(review): the macro is presumably provided by <miopen/env.hpp>. The code
// that reads these variables is not visible in this part of the file; going by
// the names alone, the first looks like the on/off switch for this forward
// GTC XDLOPS NHWC assembly solver and the second looks like a switch for the
// packed fp16 atomic-add path -- confirm against the readers before relying on it.
MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC)
MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16)

// File-wide limit related to forward GEMM-K splitting.
// NOTE(review): presumably the maximum split setting the solver will try
// (whether this is a count or a log2 exponent is not visible here) -- TODO confirm
// at the point of use.
#define FWD_MAX_GEMM_K_SPLITS 8
// Debug-only switch, left disabled. Uncommenting the line below defines the macro;
// judging by its name it enables a validity check of the kernel tile list --
// TODO confirm where it is tested.
// #define DEBUG_IGEMM_ASM_FWD_NHWC_CHECK_VALID_TILE_LIST

namespace miopen {
namespace solver {
namespace conv {

// Namespace-local alias: inside miopen::solver::conv the unqualified name
// ProblemDescription refers to miopen::conv::ProblemDescription.
using ProblemDescription = miopen::conv::ProblemDescription;

static const inline std::vector<PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC>&
GetFwdXdlopsNHWCConfigList()
{
    // clang-format off
    static const  std::vector<PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC> kernel_param_list {
        {"fwd", "nhwc", miopenFloat,  0, 1, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 256,  64,   4, 64, 16,  1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1,  4,  1, 64}, { 1, 1, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, {  1,  8,  1, 32}, { 1, 2, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 2, 8, 1}, {  1,  8,  1, 32}, { 1, 2, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, {  1,  8,  1, 32}, { 1, 2, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 2, 8, 1}, {  1,  8,  1, 32}, { 1, 2, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 256,  32,   8, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1,  8,  1, 32}, { 1, 1, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 256,  32,   4, 64, 32,  1, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1,  4,  1, 32}, { 1, 1, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128, 128,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128, 128,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128, 128,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128, 128,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128, 128,   8, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  2,  1,128}, { 1, 4, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128, 128,   8, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  2,  1,128}, { 1, 4, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128, 128,   8, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  2,  1,128}, { 1, 4, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128, 128,   8, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  2,  1,128}, { 1, 4, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128, 128,   8, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1,  8,  1, 32}, { 1, 1, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128, 128,   4, 32, 32,  1, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, {  1,  4,  1, 64}, { 1, 1, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,  32, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,  32, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  64,  32, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  64,  32, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  64,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 8, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  64,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 8, 1, 1}, {  1,  2,  4, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,   8, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, {  1,  2,  4, 32}, { 1, 2, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  64,   8, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1, 4, 1, 1}, {  1,  2,  4, 32}, { 1, 2, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,   8, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, {  1,  2,  4, 32}, { 1, 2, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  64,   8, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1, 4, 1, 1}, {  1,  2,  4, 32}, { 1, 2, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,   8, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1,  8,  1, 32}, { 1, 1, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  64,   4, 64, 32,  1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 2, 1}, {  1,  4,  1, 64}, { 1, 1, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  32,  32, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 16}, { 1, 4, 2, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  32,  32, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 16}, { 1, 4, 2, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  32,  32, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 16}, { 1, 4, 2, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  32,  32, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 16}, { 1, 4, 2, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0, 128,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 32}, { 1, 4, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  32,   8, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1,  8,  1, 32}, { 1, 1, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1, 128,  32,   4, 64, 32,  1, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1,  4,  1, 32}, { 1, 1, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64,  64,  32, 16, 16,  4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64,  64,  32, 16, 16,  4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64,  64,  32, 16, 16,  4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64,  64,  32, 16, 16,  4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64,  32,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64,  32,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64,  32,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64,  32,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 2, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64,  16,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 16}, { 1, 4, 1, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64,  16,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 16}, { 1, 4, 1, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  64,  16,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 16}, { 1, 4, 1, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  64,  16,  32, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 16}, { 1, 4, 1, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  32,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  32,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  32,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  32,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  16,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 16}, { 1, 4, 4, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  16,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 16}, { 1, 4, 4, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 1,  16,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 16}, { 1, 4, 4, 1}, {  1,  8,  1, 16}},
        {"fwd", "nhwc", miopenFloat,  0, 0,  16,  64,  32, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 16}, { 1, 4, 4, 1}, {  1,  8,  1, 16}},

        {"fwd", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  64,   8, 64, 16,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1,  8,  1, 32}, { 1, 1, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 256,  32,   8, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1,  8,  1, 32}, { 1, 1, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 128,  16, 32, 32,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1, 16}, { 1, 1, 8, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128, 128,   8, 32, 32,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1,  8,  1, 32}, { 1, 1, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  64,  16, 32, 32,  4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1, 16}, { 1, 1, 4, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1, 128,  32,  16, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1, 16}, { 1, 1, 2, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  64,  16, 16, 16,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1, 16,  1, 16}, { 1, 1, 4, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  64,  32,  16, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1,  8}, { 1, 1, 4, 1}, {  1, 16,  1,  8}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 1,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenHalf,  0, 0,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},

        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256, 128,  16, 64, 32,  4, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  2,  1,128}, { 1, 8, 1, 1}, {  1,  2,  1,128}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256,  64,  16, 64, 32,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  64,   8, 64, 16,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1,  8,  1, 32}, { 1, 1, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 8, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 256,  32,   8, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1,  8,  1, 32}, { 1, 1, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128, 128,  16, 32, 32,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1, 16}, { 1, 1, 8, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128, 128,   8, 32, 32,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1,  8,  1, 32}, { 1, 1, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 1, { 1,16, 1, 1}, {  1,  2,  4, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  64,  16, 32, 32,  4, 1, 1, 2, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1, 16}, { 1, 1, 4, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0, 128,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 4, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1, 128,  32,  16, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1, 16}, { 1, 1, 2, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 4, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 16, 16, 16, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  8,  1, 32}, { 1, 8, 2, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64,  64,  16, 16, 16,  4, 1, 1, 2, 2, 0, 0, 0, 1, 0, { 1, 1, 4, 1}, {  1, 16,  1, 16}, { 1, 1, 4, 1}, {  1, 16,  1, 16}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  64,  32,  32, 64, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 2, 1}, {  1,  4,  1, 32}, { 1, 8, 1, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  64,  32,  16, 64, 16,  4, 1, 1, 1, 1, 0, 0, 0, 1, 0, { 1, 1, 8, 1}, {  1, 16,  1,  8}, { 1, 1, 4, 1}, {  1, 16,  1,  8}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 8, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  32, 128,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 4, 1}, {  1,  8,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 1,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
        {"fwd", "nhwc", miopenBFloat16,  0, 0,  32,  64,  32, 16, 64,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 32}, { 1, 8, 2, 1}, {  1,  4,  1, 32}},
    };
    // clang-format on
    return kernel_param_list;
}

// clang-format off
// Returns the fp32 ("miopenFloat") forward NHWC configuration that this file
// designates as its largest tile. The 256, 64, 16 values are presumably
// gemm_m/n/k_per_block (field order is declared elsewhere -- confirm there).
static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
GetFwdXdlopsNHWCConfigLargestTileFp32()
{
    PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC largest_tile = {"fwd", "nhwc", miopenFloat,  0, 1, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 4, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}};
    return largest_tile;
}

// Returns the fp16 ("miopenHalf") forward NHWC configuration that this file
// designates as its largest tile. The 256, 128, 32 values are presumably
// gemm_m/n/k_per_block (field order is declared elsewhere -- confirm there).
static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
GetFwdXdlopsNHWCConfigLargestTileFp16()
{
    PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC largest_tile = {"fwd", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}};
    return largest_tile;
}

// Returns the bf16 ("miopenBFloat16") forward NHWC configuration that this
// file designates as its largest tile. The values are the same as the first
// miopenBFloat16 row of the kernel config table.
static inline PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
GetFwdXdlopsNHWCConfigLargestTileBf16()
{
    PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC largest_tile = {"fwd", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 8, 4, 1}, {  1,  4,  1, 64}, { 1, 8, 2, 1}, {  1,  4,  1, 64}};
    return largest_tile;
}
// clang-format on

// Computes the launch geometry of one forward NHWC kernel configuration.
//
// The batch is first divided into `splits_4G` pieces by igemm_split_batch_size().
// The grid size is then the product of:
//   - the group count,
//   - ceil(gemm_m / gemm_m_per_block), with gemm_m = (n / splits_4G) * ho * wo,
//   - ceil(gemm_n / gemm_n_per_block), with gemm_n = k / group,
//   - 2^gemm_k_global_split.
//
// Returns (block_size, grid_size, splits_4G). If igemm_split_batch_size()
// yields 0, the result is (block_size, 0, 0): callers compare the returned
// splits_4G against 0 to reject the configuration.
static std::tuple<size_t, // block_size
                  size_t, // grid_size
                  size_t> // splits_4G
GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(
    const ProblemDescription& problem,
    const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config)
{
    const int n      = ProblemInterpreter::GetBatchN(problem);
    const int k      = ProblemInterpreter::GetOutputChannelK(problem);
    const int ho     = ProblemInterpreter::GetOutputHeightHo(problem);
    const int wo     = ProblemInterpreter::GetOutputWidthWo(problem);
    const auto group = ProblemInterpreter::GetGroupCountG(problem);

    const int hi = ProblemInterpreter::GetInputHeightHi(problem);
    const int wi = ProblemInterpreter::GetInputWidthWi(problem);
    const int c  = ProblemInterpreter::GetInputChannelC(problem);

    auto splits_4G =
        igemm_split_batch_size(hi,
                               wi,
                               ho,
                               wo,
                               n,
                               k,
                               c,
                               miopen::GetTypeSize(ProblemInterpreter::GetInputDataType(problem)));

    size_t block_size = config.BlockSize();

    // splits_4G is used as a divisor below. A zero value must be reported to
    // the caller instead of being divided by (integer division by zero is
    // undefined behavior).
    if(splits_4G == 0)
        return std::make_tuple(block_size, static_cast<size_t>(0), static_cast<size_t>(0));

    const auto gemm_m = (n / splits_4G) * ho * wo;
    const auto gemm_n = k / group;
    size_t grid_size =
        static_cast<size_t>(group) * integer_divide_ceil(gemm_m, config.gemm_m_per_block) *
        integer_divide_ceil(gemm_n, config.gemm_n_per_block) * (1 << config.gemm_k_global_split);
    return std::make_tuple(block_size, grid_size, splits_4G);
}

// Fills this object with a heuristically chosen kernel configuration.
//
// Steps:
//   1. HeuristicInitMacroTileNoPadGemmK() picks an (m, n, k) macro tile from the
//      tile list of the problem's precision.
//   2. If a tile was found, the kernel config table is scanned for an entry of
//      the same precision and tile whose `nxe` agrees with the problem being a
//      unit convolution (1x1 filter, stride 1, dilation 1, no left padding) and
//      which passes IsValid(). That entry is copied, optionally with a gemm-k
//      global split applied.
//   3. If no tile was found, if fp16/bf16 output channels per group are odd, or
//      if step 2 matched nothing, the entry whose A and B thread lengths [1] are
//      both 1 (the "pad c" kernels) with the smallest total padding is copied.
void PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::HeuristicInit(
    const ExecutionContext&, const ProblemDescription& problem)
{
    // Candidate (m_per_block, n_per_block, k_per_block) tiles per precision.
    static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
        std::make_tuple(128, 128, 16),
        std::make_tuple(128, 128, 8),
        std::make_tuple(128, 64, 16),
        std::make_tuple(128, 64, 32),
        std::make_tuple(64, 128, 16),
        std::make_tuple(128, 32, 32),
        std::make_tuple(128, 32, 16),
        std::make_tuple(256, 64, 16),
        std::make_tuple(64, 256, 16),
        std::make_tuple(64, 64, 32),
        std::make_tuple(64, 32, 32),
        std::make_tuple(64, 16, 32),
        std::make_tuple(32, 64, 32),
        std::make_tuple(16, 64, 32),
    };

    static const std::vector<std::tuple<int, int, int>> tile_list_fp16 = {
        std::make_tuple(128, 128, 32),
        std::make_tuple(256, 128, 32),
        std::make_tuple(128, 256, 32),
        std::make_tuple(128, 64, 32),
        std::make_tuple(64, 128, 32),
        std::make_tuple(256, 64, 32),
        std::make_tuple(64, 256, 32),
        std::make_tuple(64, 64, 64),
        std::make_tuple(256, 32, 32),
        std::make_tuple(32, 256, 32),
        std::make_tuple(128, 32, 32),
        std::make_tuple(32, 128, 32),
        std::make_tuple(64, 32, 32),
        std::make_tuple(32, 64, 32),
    };

    static const std::vector<std::tuple<int, int, int>> tile_list_bfp16 = {
        std::make_tuple(128, 128, 32),
        std::make_tuple(256, 128, 32),
        std::make_tuple(128, 256, 32),
        std::make_tuple(128, 64, 32),
        std::make_tuple(64, 128, 32),
        std::make_tuple(256, 64, 32),
        std::make_tuple(64, 256, 32),
        std::make_tuple(64, 64, 64),
        std::make_tuple(256, 32, 32),
        std::make_tuple(32, 256, 32),
        std::make_tuple(128, 32, 32),
        std::make_tuple(32, 128, 32),
        std::make_tuple(64, 32, 32),
        std::make_tuple(32, 64, 32),
    };

#ifdef DEBUG_IGEMM_ASM_FWD_NHWC_CHECK_VALID_TILE_LIST
    // Debug-only self check: every tile above must have at least one non-pad-c
    // entry of the same precision in the kernel config table.
    const auto& c_list = GetFwdXdlopsNHWCConfigList();
    for(const auto& tile : tile_list_fp16)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp32" || config.precision == "bf16")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp &&
               !(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1))
            {
                // pad c configs can't be used in tile list
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp16 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
    for(const auto& tile : tile_list_fp32)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp16" || config.precision == "bf16")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp &&
               !(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1))
            {
                // pad c configs can't be used in tile list
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp32 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }

    for(const auto& tile : tile_list_bfp16)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp16" || config.precision == "fp32")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp &&
               !(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1))
            {
                // pad c configs can't be used in tile list
                found = true;
                break;
            }
        }
        if(!found)
        {
            // This loop validates the bf16 tile list, so name it correctly.
            MIOPEN_LOG_E("bf16 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
#endif

    const int n           = ProblemInterpreter::GetBatchN(problem);
    const int c           = ProblemInterpreter::GetInputChannelC(problem);
    const int k           = ProblemInterpreter::GetOutputChannelK(problem);
    const int ho          = ProblemInterpreter::GetOutputHeightHo(problem);
    const int wo          = ProblemInterpreter::GetOutputWidthWo(problem);
    const auto stride_h   = ProblemInterpreter::GetAdjustedAsmInputStrideH(problem);
    const auto stride_w   = ProblemInterpreter::GetAdjustedAsmInputStrideW(problem);
    const auto pad_h      = ProblemInterpreter::GetInputLeftPadH(problem);
    const auto pad_w      = ProblemInterpreter::GetInputLeftPadW(problem);
    const auto dilation_h = ProblemInterpreter::GetAdjustedConvolutionDilationH(problem);
    const auto dilation_w = ProblemInterpreter::GetAdjustedConvolutionDilationW(problem);
    const int y           = ProblemInterpreter::GetFilterHeightY(problem);
    const int x           = ProblemInterpreter::GetFilterWidthX(problem);
    const auto group      = ProblemInterpreter::GetGroupCountG(problem);

    // Implicit GEMM dimensions of the whole problem (per group for n and k).
    size_t gemm_m = static_cast<size_t>(n) * ho * wo;
    size_t gemm_n = k / group;
    size_t gemm_k = (static_cast<size_t>(c) / group) * y * x;

    bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
                     (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0);
    // fp16/bf16 with an odd number of output channels per group goes straight
    // to the pad-c fallback below.
    bool not_support_vector_store =
        (problem.IsFp16() || problem.IsBfp16()) && ((k / group) % 2 != 0);
    int m_per_block, n_per_block, k_per_block;

    std::tie(m_per_block, n_per_block, k_per_block) = HeuristicInitMacroTileNoPadGemmK(
        gemm_m,
        gemm_n,
        gemm_k,
        problem.IsFp32() ? tile_list_fp32 : (problem.IsFp16() ? tile_list_fp16 : tile_list_bfp16));

    // Fallback: among the pad-c entries of the matching precision, copy the one
    // with the smallest summed ComputeMatrixPadSize() over the m-k, n-k and m-n
    // planes. If nothing matches, entry 0 of the table is copied.
    auto find_with_gemm_k_pad = [&]() {
        const auto& config_list = GetFwdXdlopsNHWCConfigList();
        size_t min_pad_pixel    = std::numeric_limits<std::size_t>::max();
        size_t selected_index   = 0;
        for(size_t i = 0; i < config_list.size(); i++)
        {
            const auto& config = config_list[i];
            if(!((problem.IsFp16() && config.precision == "fp16") ||
                 (problem.IsBfp16() && config.precision == "bf16") ||
                 (problem.IsFp32() && config.precision == "fp32")))
                continue;
            if(!(config.tensor_a_thread_lengths[1] == 1 && config.tensor_b_thread_lengths[1] == 1))
                continue;
            // If we get here, then this is our last hope.
            // This kind of kernel supports any config.
            size_t cur_pad_pixel =
                ComputeMatrixPadSize(
                    gemm_m, config.gemm_m_per_block, gemm_k, config.gemm_k_per_block) +
                ComputeMatrixPadSize(
                    gemm_n, config.gemm_n_per_block, gemm_k, config.gemm_k_per_block) +
                ComputeMatrixPadSize(
                    gemm_m, config.gemm_m_per_block, gemm_n, config.gemm_n_per_block);
            if(cur_pad_pixel < min_pad_pixel)
            {
                min_pad_pixel  = cur_pad_pixel;
                selected_index = i;
            }
        }
        CopyParameters(config_list[selected_index]);
    };

    if((m_per_block == 0 && n_per_block == 0 && k_per_block == 0) || not_support_vector_store)
    {
        // not found, let's try gemm_k pad now.
        find_with_gemm_k_pad();
    }
    else
    {
        // found a suitable m/n/k, now let's prepare the other parameters and initialize one
        const auto& config_list = GetFwdXdlopsNHWCConfigList();
        for(const auto& config : config_list)
        {
            if(!((problem.IsFp16() && config.precision == "fp16") ||
                 (problem.IsBfp16() && config.precision == "bf16") ||
                 (problem.IsFp32() && config.precision == "fp32")))
                continue;

            if(m_per_block == config.gemm_m_per_block && n_per_block == config.gemm_n_per_block &&
               k_per_block == config.gemm_k_per_block)
            {
                bool need_k_split = false;
                if(problem.IsFp16())
                {
                    // fp16 has an extra limitation on k size, which decides whether
                    // need_k_split is used or not
                    if(k % 8 != 0 && k % 2 == 0)
                    {
                        need_k_split = true;
                    }
                }
                size_t current_grid_size;
                std::tie(std::ignore, current_grid_size, std::ignore) =
                    GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, config);
                size_t gks = ComputeLog2GemmKGlobalSplitsWith2DMerge(current_grid_size,
                                                                     1200,
                                                                     c / group,
                                                                     1,
                                                                     config.gemm_k_per_block,
                                                                     FWD_MAX_GEMM_K_SPLITS);
                need_k_split |= gks != 0;

                // nxe == 0 entries are taken only for unit convolutions and
                // nxe != 0 entries only for non-unit ones.
                if((unit_conv && config.nxe == 0) || (!unit_conv && config.nxe != 0))
                {
                    if(!config.IsValid(problem)) // last check before assigning a heuristic value
                        continue;
                    CopyParameters(config);
                    if(need_k_split)
                    {
                        if(env::disabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16))
                        {
                            if(problem.IsFp16() && gks > 0)
                                vector_store = 1;
                        }
                        if(gks > 0)
                            gemm_k_global_split = static_cast<int>(gks);
                    }
                    return;
                }
            }
        }
        // last try
        find_with_gemm_k_pad();
    }
}

// Advances this object to the next candidate of the generic search.
// Returns true when a candidate was produced and false when the search is over.
bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::SetNextValue(const ProblemDescription&)
{
    // The main set (no spare) is never iterated: ending the search right away
    // makes sure the spare set gets used.
    if(!use_spare_set)
        return false;

    const auto& config_list = GetFwdXdlopsNHWCConfigList();

    // First call on a default-constructed object: load the entry at `index`.
    if(IsDefaultConstructed())
    {
        CopyParameters(config_list[index]);
        return true;
    }

    // While a gemm-k global split is active, step it first. The table entry
    // only changes once NextLinear() reports true.
    if(gemm_k_global_split != 0 && !NextLinear<1, FWD_MAX_GEMM_K_SPLITS>(gemm_k_global_split))
        return true;

    index++;
    if(index >= config_list.size())
        return false;

    CopyParameters(config_list[index]);
    return true;
}

// Reports whether this object holds an acceptable set of values: either it is
// still default-constructed, or it compares equal to some entry of the kernel
// config table.
bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValidValue() const
{
    if(IsDefaultConstructed())
        return true;
    const auto& config_list = GetFwdXdlopsNHWCConfigList();
    // Fast path: the entry that `index` points at.
    if(index < config_list.size() && *this == config_list[index])
        return true;
    // Otherwise scan the whole table. Entries are taken by const reference so
    // that no config is copied just to be compared.
    return miopen::any_of(config_list, [&](const auto& v) { return (*this == v); });
}

// Checks whether this configuration can be used for `problem`.
// Every failed check returns false; true is returned only when all pass.
bool PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC::IsValid(
    const ProblemDescription& problem) const
{
    // An object that was never filled in does not describe a kernel.
    if(IsDefaultConstructed())
        return false;

    // The kernel precision has to be the precision of the problem.
    const bool precision_matches = (problem.IsFp16() && precision == "fp16") ||
                                   (problem.IsFp32() && precision == "fp32") ||
                                   (problem.IsBfp16() && precision == "bf16");
    if(!precision_matches)
        return false;

    const bool is_gemm_k_split = gemm_k_global_split != 0;

    // With the packed atomic add for fp16 disabled, an fp16 gemm-k split is
    // accepted only together with vector_store == 1.
    if(env::disabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16) &&
       problem.IsFp16() && is_gemm_k_split && vector_store != 1)
        return false;

    const int c           = ProblemInterpreter::GetInputChannelC(problem);
    const int k           = ProblemInterpreter::GetOutputChannelK(problem);
    const auto group      = ProblemInterpreter::GetGroupCountG(problem);
    const auto stride_h   = ProblemInterpreter::GetAdjustedAsmInputStrideH(problem);
    const auto stride_w   = ProblemInterpreter::GetAdjustedAsmInputStrideW(problem);
    const auto pad_h      = ProblemInterpreter::GetInputLeftPadH(problem);
    const auto pad_w      = ProblemInterpreter::GetInputLeftPadW(problem);
    const auto dilation_h = ProblemInterpreter::GetAdjustedConvolutionDilationH(problem);
    const auto dilation_w = ProblemInterpreter::GetAdjustedConvolutionDilationW(problem);
    const int y           = ProblemInterpreter::GetFilterHeightY(problem);
    const int x           = ProblemInterpreter::GetFilterWidthX(problem);

    // 1x1 filter, unit stride and dilation, no left padding.
    const bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
                           (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) &&
                           (pad_w == 0);

    // extra large size check
    {
        size_t launch_block_size, launch_grid_size, launch_splits_4G;
        std::tie(launch_block_size, launch_grid_size, launch_splits_4G) =
            GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, *this);

        if(launch_splits_4G == 0)
            return false;

        // The product must not exceed the 32-bit unsigned maximum.
        if(launch_block_size * launch_grid_size * launch_splits_4G > 0xffffffffULL)
            return false;

        // use_workspace = 1; ATOMIC_ADD_FP16
        if(problem.IsFp16() && is_gemm_k_split && vector_store != 1 && launch_splits_4G > 1)
            return false;
    }

    const auto c_per_group = c / group;

    if(merge_e != 0)
    {
        const uint32_t s_move_slice_k_y = (gemm_k_per_block / (x * c_per_group)) % y;
        const uint32_t s_move_slice_k_x = (gemm_k_per_block / c_per_group) % x;
        const uint32_t s_move_slice_k_c = gemm_k_per_block % c_per_group;
        // 24 bit
        if(c_per_group >= 0xffffff || y >= 0xffffff || x >= 0xffffff)
            return false;
        // 8 bit
        if(s_move_slice_k_y >= 256 || s_move_slice_k_x >= 256 || s_move_slice_k_c >= 256)
            return false;
    }

    const int gemm_k_shift = is_gemm_k_split ? 1 : 0;

    // gkgs check
    if(is_gemm_k_split &&
       gemm_k_global_split > igemm_get_max_gks(c_per_group, gemm_k_per_block, FWD_MAX_GEMM_K_SPLITS))
        return false;

    // Both thread lengths [1] being 1 indicates padded c support; only the
    // other kernels carry the channel restrictions below.
    const bool pads_c = tensor_a_thread_lengths[1] == 1 && tensor_b_thread_lengths[1] == 1;
    if(!pads_c)
    {
        if((c_per_group >> gemm_k_shift) == 0)
            return false;
        if(c_per_group % (gemm_k_per_block << gemm_k_shift) != 0)
            return false;

        // also, add this restriction to k, for vector write out
        if(problem.IsFp16() || problem.IsBfp16())
        {
            const auto k_per_group = k / group;
            if(is_gemm_k_split)
            {
                if(k_per_group % 2 != 0)
                    return false;
            }
            else if(k_per_group % gcd(gemm_n_per_block, vector_store == 0 ? 8 : vector_store) !=
                    0)
            {
                return false;
            }
        }
    }

    // nxe == 0 kernels handle unit convolutions only.
    if(nxe == 0 && !unit_conv)
        return false;

    // Extra limitation for the spare set: a non-1x1 kernel (except the ones
    // padding gemm_k) can't run the 1x1 case.
    if(use_spare_set && unit_conv && nxe != 0 && !pads_c)
        return false;

    return true;
}

// Builds the default performance config for `problem` by running
// HeuristicInit() on a fresh object, logs its string form at info level, and
// returns it.
PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetDefaultPerformanceConfig(
    const ExecutionContext& ctx, const ProblemDescription& problem) const
{
    PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC heuristic_config;
    heuristic_config.HeuristicInit(ctx, problem);
    MIOPEN_LOG_I(heuristic_config.ToString());
    return heuristic_config;
}

// Accepts a config only when it passes both of its own checks: the
// argument-free IsValidValue() and IsValid() against this problem. The
// execution context is not consulted.
bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsValidPerformanceConfig(
    const ExecutionContext&,
    const ProblemDescription& problem,
    const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) const
{
    // The problem-specific check is only reached when the value check passes.
    if(!config.IsValidValue())
        return false;

    return config.IsValid(problem);
}

// Tuning entry point. All of the work is delegated to GenericSearch(), which
// receives this solver together with the context, problem and invoke
// parameters; whatever it returns is handed back to the caller.
PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC
ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
{
    const auto& this_solver = *this;
    return GenericSearch(this_solver, ctx, problem, invoke_ctx);
}

size_t ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(
    const ExecutionContext& ctx, const ProblemDescription& problem) const
{
    const int hi          = ProblemInterpreter::GetInputHeightHi(problem);
    const int wi          = ProblemInterpreter::GetInputWidthWi(problem);
    const int n           = ProblemInterpreter::GetBatchN(problem);
    const int k           = ProblemInterpreter::GetOutputChannelK(problem);
    const int c           = ProblemInterpreter::GetInputChannelC(problem);
    const int ho          = ProblemInterpreter::GetOutputHeightHo(problem);
    const int wo          = ProblemInterpreter::GetOutputWidthWo(problem);
    const int y           = ProblemInterpreter::GetFilterHeightY(problem);
    const int x           = ProblemInterpreter::GetFilterWidthX(problem);
    const auto group      = ProblemInterpreter::GetGroupCountG(problem);
    const auto is_nchw    = problem.IsLayoutDefault();
    size_t workspace_size = 0;

    size_t size_trans_input  = 0;
    size_t size_trans_weight = 0;
    size_t size_trans_output = 0;
    size_t size_tensor_cast  = 0;

    if(is_nchw)
    {

        TransposeSolutionDefault2Nhwc trans_input(
            ctx, ProblemInterpreter::GetInputDataType(problem), n, c, hi, wi);
        TransposeSolutionDefault2Nhwc trans_weight(ctx,
                                                   ProblemInterpreter::GetWeightsDataType(problem),
                                                   k,
                                                   c / group,
                                                   y,
                                                   x); // group * k_per_group as batch for weight
        TransposeSolutionNhwc2Default trans_output(
            ctx, ProblemInterpreter::GetOutputDataType(problem), n, k, ho, wo);

        if(!trans_input.IsSkippable())
            size_trans_input = trans_input.GetOutputTensorSize();
        if(!trans_weight.IsSkippable())
            size_trans_weight = trans_weight.GetOutputTensorSize();
        if(!trans_output.IsSkippable())
            size_trans_output = trans_output.GetOutputTensorSize();
    }

    if(!problem.IsFp32())
    {
        size_tensor_cast =
            miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
                                             // kernel is FP32, when using FP32 atomic
            * n * k * ho * wo;
    }

    MultiBufferWorkspaceTraits wt(
        {size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast});
    workspace_size = wt.GetSize();

    return workspace_size;
}

// Decides whether this solver may be used for the given context and problem.
//
// Returns false as soon as any requirement below is violated; the checks run
// in the order written. Returns true only when all of them pass, including the
// final launch-size bound computed for the largest tile configuration of the
// problem's data type.
bool ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::IsApplicable(
    const ExecutionContext& ctx, const ProblemDescription& problem) const
{
    // Disabled through the environment, or the deterministic attribute is set.
    if(env::disabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_FWD_GTC_XDLOPS_NHWC) ||
       problem.GetConv().attribute.deterministic)
        return false;

#if WORKAROUND_ISSUE_1979
    // Grouped convolutions are rejected while this workaround is enabled.
    if(ProblemInterpreter::GetGroupCountG(problem) > 1)
        return false;
#endif

#if WORKAROUND_ISSUE_2624
    {
        // Small channel counts combined with vertical dilation are rejected
        // while this workaround is enabled.
        const int in_channels = ProblemInterpreter::GetInputChannelC(problem);
        const auto dil_h      = ProblemInterpreter::GetAdjustedConvolutionDilationH(problem);
        if(in_channels <= 4 && dil_h > 1)
            return false;
    }
#endif

    // Only these targets are accepted.
    const auto device_name         = ctx.GetStream().GetDeviceName();
    const bool is_supported_device = (device_name == "gfx908") || (device_name == "gfx90a") ||
                                     (device_name == "gfx942") ||
                                     StartsWith(device_name, "gfx95");
    if(!is_supported_device)
        return false;

    // Layout, build mode, direction and tensor-shape requirements.
    const bool is_supported_layout = problem.IsLayoutDefault() || problem.IsLayoutNHWC();
    if(!is_supported_layout || !ctx.use_asm_kernels || !problem.IsDirectionForward() ||
       !problem.Is2d() || problem.HasNonPackedTensors() || !problem.AllTensorsDimsFitIntoInt())
        return false;

    // FP32 and FP16 are accepted on every supported device; BF16 only on
    // gfx90a, gfx942 and gfx95*.
    const bool is_supported_type =
        problem.IsFp32() || problem.IsFp16() ||
        (problem.IsBfp16() && (device_name == "gfx90a" || device_name == "gfx942" ||
                               StartsWith(device_name, "gfx95")));
    if(!is_supported_type)
        return false;

    if(problem.IsTensorsCasted() || !ctx.rmv.IsV3())
        return false;

    // Rejected when the target reports XNACK and it is enabled.
    const auto& target = ctx.GetStream().GetTargetProperties();
    const auto xnack   = target.Xnack();
    if(xnack && *xnack)
        return false; // NOLINT (readability-simplify-boolean-expr)

    // A batch split of 0 means the helper found no usable split.
    const auto batch_split =
        igemm_split_batch_size(ProblemInterpreter::GetInputHeightHi(problem),
                               ProblemInterpreter::GetInputWidthWi(problem),
                               ProblemInterpreter::GetOutputHeightHo(problem),
                               ProblemInterpreter::GetOutputWidthWo(problem),
                               ProblemInterpreter::GetBatchN(problem),
                               ProblemInterpreter::GetOutputChannelK(problem),
                               ProblemInterpreter::GetInputChannelC(problem),
                               miopen::GetTypeSize(ProblemInterpreter::GetInputDataType(problem)));
    if(batch_split == 0)
        return false;

    // The launch geometry for the largest tile of this data type has to fit
    // into 32 bits.
    const auto largest_config = problem.IsFp32()   ? GetFwdXdlopsNHWCConfigLargestTileFp32()
                                : problem.IsFp16() ? GetFwdXdlopsNHWCConfigLargestTileFp16()
                                                   : GetFwdXdlopsNHWCConfigLargestTileBf16();
    size_t largest_block_size;
    size_t largest_grid_size;
    size_t largest_splits_4G;
    std::tie(largest_block_size, largest_grid_size, largest_splits_4G) =
        GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, largest_config);

    return largest_block_size * largest_grid_size * largest_splits_4G <= 0xffffffffULL;
}

// Assembles the ConvSolution for a problem and a chosen performance config.
//
// construction_params is filled in this order:
//   1. the implicit-GEMM assembly kernel named by config.ToKernelName();
//   2. on gfx90a with FP16 only: the same kernel built a second time with
//      igemm_fwd_fp16_alt_impl=1 (the first build then carries
//      igemm_fwd_fp16_alt_impl=0);
//   3. for default-layout problems: the input, weight and output transpose
//      kernels, each only when it is not skippable.
// The workspace size comes from GetWorkspaceSize() and the invoker from
// MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory().
ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution(
    const ExecutionContext& ctx,
    const ProblemDescription& problem,
    const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC& config) const
{
    size_t block_size;
    size_t grid_size;
    int splits_4G;
    std::tie(block_size, grid_size, splits_4G) =
        GetImplicitGemmGtcDynamicFwdXdlopsNHWCKernel(problem, config);

    const std::string kernel_name = config.ToKernelName(ctx);

    ConvSolution result;
    result.workspace_sz = GetWorkspaceSize(ctx, problem);

    // Launch description of the GEMM kernel: 1-D work-groups of block_size,
    // grid_size of them along x, splits_4G along y.
    KernelInfo gemm_kernel;
    gemm_kernel.kernel_file = kernel_name + ".s";
    gemm_kernel.kernel_name = kernel_name;
    gemm_kernel.l_wk.clear();
    gemm_kernel.l_wk.push_back(block_size);
    gemm_kernel.l_wk.push_back(1);
    gemm_kernel.l_wk.push_back(1);
    gemm_kernel.g_wk.clear();
    gemm_kernel.g_wk.push_back(grid_size * block_size);
    gemm_kernel.g_wk.push_back(splits_4G);
    gemm_kernel.g_wk.push_back(1);

    const auto device_name  = ctx.GetStream().GetDeviceName();
    const bool use_fp16_alt = (device_name == "gfx90a") && problem.IsFp16();
    const bool log_details  = miopen::IsLogging(LoggingLevel::Info2);

    // Assembler symbols shared by every build of the GEMM kernel, plus the
    // extra text appended to the Info2 log line.
    std::ostringstream common_opts;
    std::ostringstream log_suffix;
    GenerateClangDefsym(common_opts, "ROCM_METADATA_VERSION", ctx.rmv.UseV3() ? 5 : 4);
    if(device_name == "gfx942")
    {
        GenerateClangDefsym(common_opts, "force_sc0_sc1", 0);
        GenerateClangDefsym(common_opts, "atomic_add_using_cas", 0);
        if(log_details)
            log_suffix << ", force_sc0_sc1:0, atomic_add_using_cas:0 (gfx942)";
    }

    // First build: the alt-impl symbol is defined (as 0) only when the
    // second, alternative build is going to be added as well.
    {
        std::ostringstream build_opts(common_opts.str(), std::ios_base::ate);
        if(use_fp16_alt)
            GenerateClangDefsym(build_opts, "igemm_fwd_fp16_alt_impl", 0);
        gemm_kernel.comp_options = build_opts.str();
        result.construction_params.push_back(gemm_kernel);
    }

    // Second build of the same kernel with the alternative FP16 implementation.
    if(use_fp16_alt)
    {
        std::ostringstream build_opts(common_opts.str(), std::ios_base::ate);
        GenerateClangDefsym(build_opts, "igemm_fwd_fp16_alt_impl", 1);
        gemm_kernel.comp_options = build_opts.str();
        result.construction_params.push_back(gemm_kernel);
        if(log_details)
            log_suffix << ", fp16_alt:" << problem.GetConv().attribute.gfx90aFp16alt.GetFwd();
    }

    if(problem.IsLayoutDefault())
    {
        const int n      = ProblemInterpreter::GetBatchN(problem);
        const int c      = ProblemInterpreter::GetInputChannelC(problem);
        const int k      = ProblemInterpreter::GetOutputChannelK(problem);
        const int hi     = ProblemInterpreter::GetInputHeightHi(problem);
        const int wi     = ProblemInterpreter::GetInputWidthWi(problem);
        const int ho     = ProblemInterpreter::GetOutputHeightHo(problem);
        const int wo     = ProblemInterpreter::GetOutputWidthWo(problem);
        const int y      = ProblemInterpreter::GetFilterHeightY(problem);
        const int x      = ProblemInterpreter::GetFilterWidthX(problem);
        const auto group = ProblemInterpreter::GetGroupCountG(problem);

        TransposeSolutionDefault2Nhwc trans_input(
            ctx, ProblemInterpreter::GetInputDataType(problem), n, c, hi, wi);
        TransposeSolutionDefault2Nhwc trans_weight(ctx,
                                                   ProblemInterpreter::GetWeightsDataType(problem),
                                                   k,
                                                   c / group,
                                                   y,
                                                   x); // group * k_per_group as batch for weight
        TransposeSolutionNhwc2Default trans_output(
            ctx, ProblemInterpreter::GetOutputDataType(problem), n, k, ho, wo);

        // Registers a transpose kernel unless it is skippable, and notes its
        // name for the Info2 log line.
        const auto append_transpose = [&](auto& transpose, const char* log_label) {
            if(transpose.IsSkippable())
                return;
            result.construction_params.push_back(transpose.GetKernelInfo());
            if(log_details)
                log_suffix << log_label << transpose.GetKernelName();
        };

        append_transpose(trans_input, ", inp trans:");
        append_transpose(trans_weight, ", wei trans:");
        append_transpose(trans_output, ", out trans:");
    }

    MIOPEN_LOG_I2(SolverDbId() << ": " << config.ToString() << log_suffix.str());

    result.invoker_factory =
        miopen::conv::MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(ctx, problem, config);
    return result;
}

} // namespace conv
} // namespace solver
} // namespace miopen
