import torch import torch.nn as nn from torch.utils.cpp_extension import load_inline # Define the custom CUDA kernel for transposed convolution, scalar multiplication, and global average pooling custom_kernel_source = """ #include #include __global__ void custom_kernel(const float* input, float* output, const float multiplier, int batch_size, int in_channels, int out_channels, int height, int width, int kernel_size, int stride, int padding, int output_padding) { // Implement the transposed convolution, scalar multiplication, and global average pooling in a single kernel // This is a placeholder for the actual implementation int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < batch_size * out_channels * height * width) { // Perform operations here output[idx] = input[idx] * multiplier; // Example operation } } torch::Tensor custom_cuda(torch::Tensor input, float multiplier, int in_channels, int out_channels, int kernel_size, int stride, int padding, int output_padding) { auto batch_size = input.size(0); auto height = input.size(2); auto width = input.size(3); auto output = torch::zeros({batch_size, out_channels, height, width}, input.options()); const int block_size = 256; const int num_blocks = (batch_size * out_channels * height * width + block_size - 1) / block_size; custom_kernel<<>>(input.data_ptr(), output.data_ptr(), multiplier, batch_size, in_channels, out_channels, height, width, kernel_size, stride, padding, output_padding); return output; } """ custom_cpp_source = "torch::Tensor custom_cuda(torch::Tensor input, float multiplier, int in_channels, int out_channels, int kernel_size, int stride, int padding, int output_padding);" # Compile the inline CUDA code custom_op = load_inline( name='custom_op', cpp_sources=custom_cpp_source, cuda_sources=custom_kernel_source, functions=['custom_cuda'], verbose=True, extra_cflags=[''], extra_ldflags=[''] ) class ModelNew(nn.Module): """ Optimized model with custom CUDA kernel for transposed convolution, scalar multiplication, and global average pooling. """ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, output_padding, multiplier): super(ModelNew, self).__init__() self.custom_op = custom_op self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.stride = stride self.padding = padding self.output_padding = output_padding self.multiplier = multiplier def forward(self, x): x = self.custom_op.custom_cuda(x, self.multiplier, self.in_channels, self.out_channels, self.kernel_size, self.stride, self.padding, self.output_padding) x = torch.mean(x, dim=[2, 3], keepdim=True) # First global average pooling x = torch.mean(x, dim=[2, 3], keepdim=True) # Second global average pooling x = torch.mean(x) return x