{ "metadata": { "name": "RadixSort" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Radix Sort\n", "\n", "The purpose of this code is to implement a canonical radix sort algorithm on the GPU." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## generate data" ] }, { "cell_type": "code", "collapsed": false, "input": [ "n = 10000\n", "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n", "input_values = input_keys.astype(numpy.float32)\n", "\n", "print input_keys\n", "print input_values" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[9875 764 6555 ..., 5036 7990 9689]\n", "[ 9875. 764. 6555. ..., 5036. 7990. 9689.]\n" ] } ], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## perform mostly CPU radix sort " ] }, { "cell_type": "code", "collapsed": false, "input": [ "import split\n", "\n", "num_bits_per_element = 32\n", "split_manager = split.SplitManager(15000)\n", "\n", "flag_data = zeros_like(input_keys)\n", "split_keys_old = input_keys.copy()\n", "split_values_old = input_values.copy()\n", "split_keys_new = zeros_like(input_keys)\n", "split_values_new = zeros_like(input_values)\n", "\n", "for b in range(num_bits_per_element):\n", "\n", " mask = 2**b\n", "\n", " for i in range(n):\n", " input_value = split_keys_old[i]\n", " flag_data[i] = not (input_value & mask)\n", "\n", " split_manager.split_host(split_keys_old, flag_data, split_keys_new)\n", " split_manager.split_host(split_values_old, flag_data, split_values_new)\n", "\n", " split_keys_old, split_keys_new = split_keys_new, split_keys_old\n", " split_values_old, split_values_new = split_values_new, split_values_old\n", " \n", " \n", " \n", "print input_keys\n", "print input_values\n", "print split_keys_old\n", "print split_values_old\n", "print numpy.sort(input_keys)\n", "print\n", " \n", "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(split_keys_new - numpy.sort(input_keys))\n", "print \"Difference between GPU and CPU values (should be 0.0%%): %f\" % numpy.linalg.norm(split_values_new - numpy.sort(input_keys).astype(float32))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[9875 764 6555 ..., 5036 7990 9689]\n", "[ 9875. 764. 6555. ..., 5036. 7990. 9689.]\n", "[ 0 1 2 ..., 9999 9999 9999]\n", "[ 0.00000000e+00 1.00000000e+00 2.00000000e+00 ..., 9.99900000e+03\n", " 9.99900000e+03 9.99900000e+03]\n", "[ 0 1 2 ..., 9999 9999 9999]\n", "\n", "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n", "Difference between GPU and CPU values (should be 0.0%): 0.000000\n" ] } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## perform fully GPU radix sort " ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pycuda.autoinit\n", "import pycuda.driver\n", "import pycuda.compiler\n", "import split\n", "\n", "source_module = pycuda.compiler.SourceModule \\\n", "(\n", "\"\"\"\n", "__global__ void radix_sort_compute_flags(\n", " unsigned int* d_input_data,\n", " unsigned int* d_output_data,\n", " int mask,\n", " int n )\n", "{\n", " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n", "\n", " if ( global_index_1d < n )\n", " {\n", " unsigned int input_value = d_input_data[ global_index_1d ];\n", "\n", " if ( input_value & mask )\n", " {\n", " d_output_data[ global_index_1d ] = 0;\n", " }\n", " else\n", " {\n", " d_output_data[ global_index_1d ] = 1;\n", " }\n", " }\n", "}\n", "\"\"\"\n", ")\n", "\n", "radix_sort_compute_flags_function = source_module.get_function(\"radix_sort_compute_flags\")\n", "\n", "size_of_element_bytes = 4\n", "size_of_element_bits = 32\n", "max_num_elements = 15000\n", "num_bytes = max_num_elements * size_of_element_bytes\n", "\n", "input_keys_device = pycuda.driver.mem_alloc(num_bytes)\n", "input_values_device = pycuda.driver.mem_alloc(num_bytes)\n", "flag_data_device = pycuda.driver.mem_alloc(num_bytes)\n", "split_keys_old_device = pycuda.driver.mem_alloc(num_bytes)\n", "split_values_old_device = pycuda.driver.mem_alloc(num_bytes)\n", "split_keys_new_device = pycuda.driver.mem_alloc(num_bytes)\n", "split_values_new_device = pycuda.driver.mem_alloc(num_bytes)\n", "\n", "split_manager = split.SplitManager(max_num_elements)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "pycuda.driver.memcpy_htod(input_keys_device, input_keys)\n", "pycuda.driver.memcpy_htod(input_values_device, input_values)\n", "pycuda.driver.memcpy_htod(split_keys_old_device, input_keys)\n", "pycuda.driver.memcpy_htod(split_values_old_device, input_values)\n", "\n", "pycuda.driver.memset_d32(flag_data_device, 0, n)\n", "pycuda.driver.memset_d32(split_keys_new_device, 0, n)\n", "pycuda.driver.memset_d32(split_values_new_device, 0, n)\n", "\n", "for b in range(num_bits_per_element):\n", "\n", " mask = numpy.int32(2**numpy.int8(b))\n", "\n", " radix_sort_compute_flags_funcion_block = (512,1,1)\n", " num_blocks = int(ceil(float(n) / float(radix_sort_compute_flags_funcion_block[0])))\n", " radix_sort_compute_flags_funcion_grid = (num_blocks, 1)\n", "\n", " radix_sort_compute_flags_function(\n", " split_keys_old_device,\n", " flag_data_device,\n", " numpy.int32(mask),\n", " numpy.int32(n),\n", " block=radix_sort_compute_flags_funcion_block,\n", " grid=radix_sort_compute_flags_funcion_grid)\n", "\n", " split_manager.split_device(split_keys_old_device, flag_data_device, split_keys_new_device, n)\n", " split_manager.split_device(split_values_old_device, flag_data_device, split_values_new_device, n)\n", "\n", " split_keys_old_device, split_keys_new_device = split_keys_new_device, split_keys_old_device\n", " split_values_old_device, split_values_new_device = split_values_new_device, split_values_old_device\n", " \n", "pycuda.driver.memcpy_dtoh(split_keys_new, split_keys_old_device)\n", "pycuda.driver.memcpy_dtoh(split_values_new, split_values_old_device)\n", "\n", "\n", "\n", "print input_keys\n", "print input_values\n", "print split_keys_new\n", "print split_values_new\n", "print numpy.sort(input_keys)\n", "print\n", "\n", "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(split_keys_new - numpy.sort(input_keys))\n", "print \"Difference between GPU and CPU values (should be 0.0%%): %f\" % numpy.linalg.norm(split_values_new - numpy.sort(input_keys).astype(float32))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[9875 764 6555 ..., 5036 7990 9689]\n", "[ 9875. 764. 6555. ..., 5036. 7990. 9689.]\n", "[ 0 1 2 ..., 9999 9999 9999]\n", "[ 0.00000000e+00 1.00000000e+00 2.00000000e+00 ..., 9.99900000e+03\n", " 9.99900000e+03 9.99900000e+03]\n", "[ 0 1 2 ..., 9999 9999 9999]\n", "\n", "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n", "Difference between GPU and CPU values (should be 0.0%): 0.000000\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "-c:12: RuntimeWarning: invalid value encountered in power\n" ] } ], "prompt_number": 4 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## define GPU radix sort class" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import math\n", "import numpy\n", "import pycuda.autoinit\n", "import pycuda.driver\n", "import pycuda.compiler\n", "import split\n", "\n", "class RadixSortManager:\n", " \n", " source_module = pycuda.compiler.SourceModule \\\n", " (\n", " \"\"\"\n", " __global__ void radix_sort_compute_flags_ascending(\n", " unsigned int* d_input_data,\n", " unsigned int* d_output_data,\n", " int mask,\n", " int n )\n", " {\n", " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n", "\n", " if ( global_index_1d < n )\n", " {\n", " unsigned int input_value = d_input_data[ global_index_1d ];\n", "\n", " if ( input_value & mask )\n", " {\n", " d_output_data[ global_index_1d ] = 0;\n", " }\n", " else\n", " {\n", " d_output_data[ global_index_1d ] = 1;\n", " }\n", " }\n", " }\n", "\n", " __global__ void radix_sort_compute_flags_descending(\n", " unsigned int* d_input_data,\n", " unsigned int* d_output_data,\n", " int mask,\n", " int n )\n", " {\n", " int global_index_1d = ( blockIdx.x * blockDim.x ) + threadIdx.x;\n", "\n", " if ( global_index_1d < n )\n", " {\n", " unsigned int input_value = d_input_data[ global_index_1d ];\n", "\n", " if ( input_value & mask )\n", " {\n", " d_output_data[ global_index_1d ] = 1;\n", " }\n", " else\n", " {\n", " d_output_data[ global_index_1d ] = 0;\n", " }\n", " }\n", " } \n", " \"\"\"\n", " )\n", "\n", " _radix_sort_compute_flags_ascending_function = source_module.get_function(\"radix_sort_compute_flags_ascending\")\n", " _radix_sort_compute_flags_descending_function = source_module.get_function(\"radix_sort_compute_flags_descending\")\n", "\n", " _size_of_element_bytes = 4\n", " _size_of_element_bits = 32\n", " \n", " _max_num_elements = -1\n", " _num_bytes = -1\n", "\n", " _input_keys_device = -1\n", " _input_values_device = -1\n", " _flag_data_device = -1\n", " _split_keys_old_device = -1\n", " _split_values_old_device = -1\n", " _split_keys_new_device = -1\n", " _split_values_new_device = -1\n", "\n", " _split_manager = -1\n", " \n", " def __init__(self, max_num_elements):\n", " \n", " self._max_num_elements = max_num_elements\n", " self._num_bytes = self._max_num_elements * self._size_of_element_bytes\n", "\n", " self._input_keys_device = pycuda.driver.mem_alloc(self._num_bytes)\n", " self._input_values_device = pycuda.driver.mem_alloc(self._num_bytes)\n", " self._flag_data_device = pycuda.driver.mem_alloc(self._num_bytes)\n", " self._split_keys_old_device = pycuda.driver.mem_alloc(self._num_bytes)\n", " self._split_values_old_device = pycuda.driver.mem_alloc(self._num_bytes)\n", " self._output_keys_device = pycuda.driver.mem_alloc(self._num_bytes)\n", " self._output_values_device = pycuda.driver.mem_alloc(self._num_bytes)\n", "\n", " self._split_manager = split.SplitManager(max_num_elements)\n", "\n", " def __copy_input_htod_key(self, input_keys_host):\n", "\n", " assert input_keys_host.shape[0] <= self._max_num_elements\n", "\n", " assert \\\n", " input_keys_host.dtype == numpy.uint32 or \\\n", " input_keys_host.dtype == numpy.int32 or \\\n", " input_keys_host.dtype == numpy.float32\n", "\n", " pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)\n", "\n", " def __copy_input_htod_key_value(self, input_keys_host, input_values_host):\n", "\n", " assert input_keys_host.shape[0] == input_values_host.shape[0]\n", " assert input_keys_host.shape[0] <= self._max_num_elements\n", "\n", " assert \\\n", " input_keys_host.dtype == numpy.uint32 or \\\n", " input_keys_host.dtype == numpy.int32 or \\\n", " input_keys_host.dtype == numpy.float32\n", "\n", " assert \\\n", " input_values_host.dtype == numpy.uint32 or \\\n", " input_values_host.dtype == numpy.int32 or \\\n", " input_values_host.dtype == numpy.float32\n", "\n", " pycuda.driver.memcpy_htod(self._input_keys_device, input_keys_host)\n", " pycuda.driver.memcpy_htod(self._input_values_device, input_values_host)\n", "\n", " def __radix_sort_key(self, input_keys_device, output_keys_device, num_elements, compute_flags_function):\n", "\n", " assert num_elements <= self._max_num_elements\n", "\n", " self._n = num_elements\n", "\n", " pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)\n", " \n", " pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)\n", " pycuda.driver.memset_d32(output_keys_device, 0, self._n)\n", "\n", " for b in range(self._size_of_element_bits):\n", "\n", " mask = numpy.int32(2**numpy.int8(b))\n", "\n", " radix_sort_compute_flags_funcion_block = (512,1,1)\n", " num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_funcion_block[0])))\n", " radix_sort_compute_flags_funcion_grid = (num_blocks, 1)\n", " \n", " compute_flags_function(\n", " self._split_keys_old_device,\n", " self._flag_data_device,\n", " numpy.int32(mask),\n", " numpy.int32(self._n),\n", " block=radix_sort_compute_flags_funcion_block,\n", " grid=radix_sort_compute_flags_funcion_grid)\n", "\n", " self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)\n", "\n", " self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device\n", "\n", " pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)\n", " \n", " def __radix_sort_key_value(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements, compute_flags_function):\n", "\n", " assert num_elements <= self._max_num_elements\n", "\n", " self._n = num_elements\n", "\n", " pycuda.driver.memcpy_dtod(self._split_keys_old_device, input_keys_device, self._n * self._size_of_element_bytes)\n", " pycuda.driver.memcpy_dtod(self._split_values_old_device, input_values_device, self._n * self._size_of_element_bytes)\n", " \n", " pycuda.driver.memset_d32(self._flag_data_device, 0, self._n)\n", " pycuda.driver.memset_d32(output_keys_device, 0, self._n)\n", " pycuda.driver.memset_d32(output_values_device, 0, self._n)\n", "\n", " for b in range(self._size_of_element_bits):\n", "\n", " mask = numpy.int32(2**numpy.int8(b))\n", "\n", " radix_sort_compute_flags_funcion_block = (512,1,1)\n", " num_blocks = int(math.ceil(float(self._n) / float(radix_sort_compute_flags_funcion_block[0])))\n", " radix_sort_compute_flags_funcion_grid = (num_blocks, 1)\n", " \n", " compute_flags_function(\n", " self._split_keys_old_device,\n", " self._flag_data_device,\n", " numpy.int32(mask),\n", " numpy.int32(self._n),\n", " block=radix_sort_compute_flags_funcion_block,\n", " grid=radix_sort_compute_flags_funcion_grid)\n", "\n", " self._split_manager.split_device(self._split_keys_old_device, self._flag_data_device, output_keys_device, self._n)\n", " self._split_manager.split_device(self._split_values_old_device, self._flag_data_device, output_values_device, self._n)\n", "\n", " self._split_keys_old_device, output_keys_device = output_keys_device, self._split_keys_old_device\n", " self._split_values_old_device, output_values_device = output_values_device, self._split_values_old_device\n", "\n", " pycuda.driver.memcpy_dtod(output_keys_device, self._split_keys_old_device, self._n * self._size_of_element_bytes)\n", " pycuda.driver.memcpy_dtod(output_values_device, self._split_values_old_device, self._n * self._size_of_element_bytes)\n", "\n", " def __copy_output_dtoh_key(self, output_keys_host):\n", "\n", " pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)\n", "\n", " def __copy_output_dtoh_key_value(self, output_keys_host, output_values_host):\n", "\n", " pycuda.driver.memcpy_dtoh(output_keys_host, self._output_keys_device)\n", " pycuda.driver.memcpy_dtoh(output_values_host, self._output_values_device)\n", "\n", " def radix_sort_key_ascending_device(self, input_keys_device, output_keys_device, num_elements):\n", "\n", " self.__radix_sort_key(\n", " input_keys_device,\n", " output_keys_device,\n", " num_elements,\n", " self._radix_sort_compute_flags_ascending_function)\n", "\n", " def radix_sort_key_descending_device(self, input_keys_device, output_keys_device, num_elements):\n", "\n", " self.__radix_sort_key(\n", " input_keys_device,\n", " output_keys_device,\n", " num_elements,\n", " self._radix_sort_compute_flags_descending_function)\n", "\n", " def radix_sort_key_ascending_host(self, input_keys_host, output_keys_host):\n", "\n", " num_elements = input_keys_host.shape[0]\n", " \n", " self.__copy_input_htod_key(input_keys_host)\n", " self.radix_sort_key_ascending_device(self._input_keys_device, self._output_keys_device, num_elements)\n", " self.__copy_output_dtoh_key(output_keys_host)\n", "\n", " def radix_sort_key_descending_host(self, input_keys_host, output_keys_host):\n", "\n", " num_elements = input_keys_host.shape[0]\n", " \n", " self.__copy_input_htod_key(input_keys_host)\n", " self.radix_sort_key_descending_device(self._input_keys_device, self._output_keys_device, num_elements)\n", " self.__copy_output_dtoh_key(output_keys_host)\n", "\n", " def radix_sort_key_value_ascending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):\n", "\n", " self.__radix_sort_key_value(\n", " input_keys_device,\n", " input_values_device,\n", " output_keys_device,\n", " output_values_device,\n", " num_elements,\n", " self._radix_sort_compute_flags_ascending_function)\n", "\n", " def radix_sort_key_value_descending_device(self, input_keys_device, input_values_device, output_keys_device, output_values_device, num_elements):\n", "\n", " self.__radix_sort_key_value(\n", " input_keys_device,\n", " input_values_device,\n", " output_keys_device,\n", " output_values_device,\n", " num_elements,\n", " self._radix_sort_compute_flags_descending_function)\n", "\n", " def radix_sort_key_value_ascending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):\n", "\n", " num_elements = input_keys_host.shape[0]\n", " \n", " self.__copy_input_htod_key_value(input_keys_host, input_values_host)\n", " self.radix_sort_key_value_ascending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)\n", " self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)\n", "\n", " def radix_sort_key_value_descending_host(self, input_keys_host, input_values_host, output_keys_host, output_values_host):\n", "\n", " num_elements = input_keys_host.shape[0]\n", " \n", " self.__copy_input_htod_key_value(input_keys_host, input_values_host)\n", " self.radix_sort_key_value_descending_device(self._input_keys_device, self._input_values_device, self._output_keys_device, self._output_values_device, num_elements)\n", " self.__copy_output_dtoh_key_value(output_keys_host, output_values_host)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## invoke GPU radix sort class" ] }, { "cell_type": "code", "collapsed": false, "input": [ "n = 10000\n", "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n", "input_values = input_keys.astype(numpy.float32)\n", "output_keys = numpy.zeros_like(input_keys)\n", "output_values = numpy.zeros_like(input_values)\n", "radix_sort_manager = RadixSortManager(15000)\n", "\n", "radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)\n", "\n", "print input_keys\n", "print input_values\n", "print output_keys\n", "print output_values\n", "print\n", "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(output_keys - numpy.sort(input_keys))\n", "print \"Difference between GPU and CPU values (should be 0.0%%): %f\" % numpy.linalg.norm(output_values - numpy.sort(input_keys).astype(float32))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[3229 8213 9674 ..., 5248 4543 1902]\n", "[ 3229. 8213. 9674. ..., 5248. 4543. 1902.]\n", "[ 0 5 5 ..., 9997 9997 9999]\n", "[ 0.00000000e+00 5.00000000e+00 5.00000000e+00 ..., 9.99700000e+03\n", " 9.99700000e+03 9.99900000e+03]\n", "\n", "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n", "Difference between GPU and CPU values (should be 0.0%): 0.000000\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "-c:172: RuntimeWarning: invalid value encountered in power\n" ] } ], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## ascending vs. descending and key-value vs. key-only" ] }, { "cell_type": "code", "collapsed": false, "input": [ "n = 10\n", "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n", "input_values = input_keys.astype(numpy.float32)\n", "output_keys = numpy.zeros_like(input_keys)\n", "output_values = numpy.zeros_like(input_values)\n", "radix_sort_manager = RadixSortManager(15000)\n", "\n", "radix_sort_manager.radix_sort_key_value_ascending_host(input_keys, input_values, output_keys, output_values)\n", "\n", "print input_keys\n", "print input_values\n", "print output_keys\n", "print output_values\n", "print\n", "\n", "\n", "\n", "n = 10\n", "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n", "input_values = input_keys.astype(numpy.float32)\n", "output_keys = numpy.zeros_like(input_keys)\n", "output_values = numpy.zeros_like(input_values)\n", "radix_sort_manager = RadixSortManager(15000)\n", "\n", "radix_sort_manager.radix_sort_key_value_descending_host(input_keys, input_values, output_keys, output_values)\n", "\n", "print input_keys\n", "print input_values\n", "print output_keys\n", "print output_values\n", "print\n", "\n", "\n", "\n", "n = 10\n", "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n", "input_values = input_keys.astype(numpy.float32)\n", "output_keys = numpy.zeros_like(input_keys)\n", "output_values = numpy.zeros_like(input_values)\n", "radix_sort_manager = RadixSortManager(15000)\n", "\n", "radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)\n", "\n", "print input_keys\n", "print output_keys\n", "print\n", "\n", "\n", "\n", "n = 10\n", "input_keys = (numpy.random.rand(n) * n).astype(numpy.uint32)\n", "input_values = input_keys.astype(numpy.float32)\n", "output_keys = numpy.zeros_like(input_keys)\n", "output_values = numpy.zeros_like(input_values)\n", "radix_sort_manager = RadixSortManager(15000)\n", "\n", "radix_sort_manager.radix_sort_key_descending_host(input_keys, output_keys)\n", "\n", "print input_keys\n", "print output_keys\n", "print\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[9 8 8 2 4 3 3 8 4 6]\n", "[ 9. 8. 8. 2. 4. 3. 3. 8. 4. 6.]\n", "[2 3 3 4 4 6 8 8 8 9]\n", "[ 2. 3. 3. 4. 4. 6. 8. 8. 8. 9.]\n", "\n", "[8 9 8 4 2 1 1 8 8 5]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[ 8. 9. 8. 4. 2. 1. 1. 8. 8. 5.]\n", "[9 8 8 8 8 5 4 2 1 1]\n", "[ 9. 8. 8. 8. 8. 5. 4. 2. 1. 1.]\n", "\n", "[7 6 8 7 4 4 9 7 8 5]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[4 4 5 6 7 7 7 8 8 9]\n", "\n", "[6 5 8 6 5 0 6 0 6 3]" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[8 6 6 6 6 5 5 3 0 0]\n", "\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "-c:137: RuntimeWarning: invalid value encountered in power\n" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## floating point keys" ] }, { "cell_type": "code", "collapsed": false, "input": [ "n = 10000\n", "input_keys = (numpy.random.rand(n) * n).astype(numpy.float32)\n", "output_keys = numpy.zeros_like(input_keys)\n", "radix_sort_manager = RadixSortManager(15000)\n", "\n", "radix_sort_manager.radix_sort_key_ascending_host(input_keys, output_keys)\n", "\n", "print input_keys\n", "print output_keys\n", "print\n", "print \"Difference between GPU and CPU keys (should be 0.0%%): %f\" % numpy.linalg.norm(output_keys - numpy.sort(input_keys))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[ 4938.57958984 9086.15136719 9926.49511719 ..., 9953.21875\n", " 5300.93457031 2960.62329102]\n", "[ 1.85716784e+00 2.97219419e+00 3.74738264e+00 ..., 9.99855371e+03\n", " 9.99873047e+03 9.99973535e+03]\n", "\n", "Difference between GPU and CPU keys (should be 0.0%): 0.000000\n" ] } ], "prompt_number": 8 } ], "metadata": {} } ] }