{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Test Burrows Wheeler Transform (BWT)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is a simple notebook to test the implementation of the BWT." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from Burrows_Wheeler_Transform import transform\n", "from Burrows_Wheeler_Transform import inverse\n", "import time" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Corectness of the implementation " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First we test the correctness of the implementation for the transform using some known BWT. We can have as input a string with or without '$'\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The BWT of the string Banana is: a$nnBaa\n", "The BWT of the string mississippi is: ipssm$pissii\n" ] } ], "source": [ "finalString = transform('Banana')\n", "if finalString == 'a$nnBaa':\n", " print('The BWT of the string Banana is: %s' %(finalString))\n", "\n", "\n", "finalString = transform('mississippi$')\n", "\n", "if finalString == 'ipssm$pissii':\n", " print('The BWT of the string mississippi is: %s' %(finalString))\n", " \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then we test the correctness of the implementation for the inverse of BWT" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The original string is: Banana\n", "The original string is: mississippi\n" ] } ], "source": [ "originalString = inverse('a$nnBaa')\n", "if originalString == 'Banana':\n", " print('The original string is: %s' %(originalString))\n", " \n", "\n", "originalString = inverse('ipssm$pissii')\n", "if originalString == 'mississippi':\n", " print('The original string is: %s' %(originalString))\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Common error\n", "This is a list of common error that an user could make " ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "invalid_string_error", "evalue": "The input mississippi$$ cointains more than one $", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31minvalid_string_error\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-4-b7d159470ec6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# transform with more then one $\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mfinalString\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'mississippi$$'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfinalString\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'ipssm$pissii'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'The BWT of the string mississippi is: %s'\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfinalString\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/Università/Magistrale/Primo anno/Secondo semestre/scientific programming/ProgrammingPython/Burrows_Wheeler_Transform/functions.py\u001b[0m in \u001b[0;36mtransform\u001b[0;34m(string)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;31m# the string must contains only one termination char\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mstring\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'$'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0minvalid_string_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'The input '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstring\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' cointains more than one $'\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;31m# check if the last char contains the '$' otherwhise add it\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31minvalid_string_error\u001b[0m: The input mississippi$$ cointains more than one $" ] } ], "source": [ "# transform with more then one $\n", "finalString = transform('mississippi$$')\n", "\n", "if finalString == 'ipssm$pissii':\n", " print('The BWT of the string mississippi is: %s' %(finalString))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "ename": "invalid_string_error", "evalue": "The input annBaa does not contain exactly one $ character", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31minvalid_string_error\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-5-163491cab8ac>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# inverse string without the $ character\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0moriginalString\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0minverse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'annBaa'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moriginalString\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'Banana'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'The original string is: %s'\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moriginalString\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/Documents/Università/Magistrale/Primo anno/Secondo semestre/scientific programming/ProgrammingPython/Burrows_Wheeler_Transform/functions.py\u001b[0m in \u001b[0;36minverse\u001b[0;34m(bwt)\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mnot_a_string_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'The input '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mbwt\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' is not a string'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 45\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mbwt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'$'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 46\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0minvalid_string_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'The input '\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mbwt\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m' does not contain exactly one $ character'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 47\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbwt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31minvalid_string_error\u001b[0m: The input annBaa does not contain exactly one $ character" ] } ], "source": [ "# inverse string without the $ character \n", "originalString = inverse('annBaa')\n", "if originalString == 'Banana':\n", " print('The original string is: %s' %(originalString)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Performance test " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have test the correctness of the implementation we will test the speed of this implementation, in particular we are interesting in smoothly handle string with 1000 characters." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "with open('TestSample/oneThousandChar.txt') as file:\n", " oneChar = file.read().replace('\\n', '')\n", "\n", " \n", " \n", "with open('TestSample/twoThousandChar.txt') as file:\n", " twoChar = file.read().replace('\\n', '')\n", "\n", "\n", "with open('TestSample/twelveThousandChar.txt') as file:\n", " twelveChar = file.read().replace('\\n', '')\n", " \n", "with open('TestSample/twentyFourChar.txt') as file:\n", " twentyFourChar = file.read().replace('\\n', '')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The time for the transform is: \n", "CPU times: user 11.5 ms, sys: 4.61 ms, total: 16.1 ms\n", "Wall time: 14.4 ms\n", "The time for the inverse is: \n", "CPU times: user 23.5 ms, sys: 2.77 ms, total: 26.3 ms\n", "Wall time: 24.9 ms\n" ] } ], "source": [ "# compute the time for the transform of one thousand characters \n", "print('The time for the transform is: ')\n", "%time finalString = transform(oneChar)\n", "\n", "# compute the time for the inverse of one thousand characters \n", "print('The time for the inverse is: ')\n", "%time inverseTransf = inverse(finalString)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The time for the transform is: \n", "CPU times: user 36 ms, sys: 15.9 ms, total: 51.9 ms\n", "Wall time: 51 ms\n", "The time for the inverse is: \n" ] } ], "source": [ "# compute the time for the transform of two thousand characters \n", "print('The time for the transform is: ')\n", "%time finalString = transform(twoChar)\n", "\n", "# compute the time for the inverse of two thousand characters \n", "print('The time for the inverse is: ')\n", "inverseTransf = inverse(finalString)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The time for the transform is: \n", "CPU times: user 1.19 s, sys: 470 ms, total: 1.66 s\n", "Wall time: 1.66 s\n", "The time for the inverse is: \n", "CPU times: user 1.61 s, sys: 10.8 ms, total: 1.63 s\n", "Wall time: 1.64 s\n" ] } ], "source": [ "# compute the time for the transform of twelve thousand characters\n", "print('The time for the transform is: ')\n", "%time finalString = transform(twelveChar)\n", "\n", "# compute the time for the inverse of twelve thousand characters \n", "print('The time for the inverse is: ')\n", "%time inverseTransf = inverse(finalString)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The time for the transform is: \n", "CPU times: user 1.21 s, sys: 451 ms, total: 1.66 s\n", "Wall time: 1.66 s\n", "The time for the inverse is: \n", "CPU times: user 1.6 s, sys: 7.79 ms, total: 1.61 s\n", "Wall time: 1.62 s\n" ] } ], "source": [ "# compute the time for the transform of twenty-four thousand characters \n", "print('The time for the transform is: ')\n", "%time finalString = transform(twentyFourChar)\n", "\n", "\n", "# compute the time for the inverse of twenty-four thousand characters \n", "print('The time for the inverse is: ')\n", "%time inverseTransf = inverse(finalString)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }