{ "cells": [ { "cell_type": "markdown", "metadata": { "collapsed": true, "deletable": true, "editable": true }, "source": "# Build PNG Files\n\nIn this notebook, we'll take the `basic` data set, use the `ibmseti` Python package to convert each data file into a spectrogram, then save as `.png` files.\n\n\nAlso, we'll split the data set into a training set and a test set and create a handful of zip files for each class. This will dovetail into the next tutorial where we will train a custom Watson Visual Recognition classifier (we will use the zip files of pngs) and measure its performance with the test set. " }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": "from __future__ import division\n\nimport cStringIO\nimport glob\nimport json\nimport requests\nimport ibmseti\nimport os\nimport zipfile\nimport numpy as np\nimport matplotlib.pyplot as plt" }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": "#Making a local folder to put my data.\n\n#NOTE: YOU MUST do something like this on a Spark Enterprise cluster at the hackathon so that\n#you can put your data into a separate local file space. Otherwise, you'll likely collide with \n#your fellow participants. 
\n\nmydatafolder = os.environ['PWD'] + '/' + 'my_team_name_data_folder'\nif os.path.exists(mydatafolder) is False:\n os.makedirs(mydatafolder)" }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": "#If you are running this in IBM Apache Spark (via Data Science Experience)\nbase_url = 'https://dal05.objectstorage.service.networklayer.com/v1/AUTH_cdbef52bdf7a449c96936e1071f0a46b'\n\n#ELSE, if you are outside of IBM:\n#base_url = 'https://dal.objectstorage.open.softlayer.com/v1/AUTH_cdbef52bdf7a449c96936e1071f0a46b'\n\n#NOTE: if you are outside of IBM, pulling down data will be slower. :/" }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/plain": "0" }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": "## You don't need to repeat this, of course, if you've already done this in the Step 1 notebook\n\nbasic4zip = '{}/simsignals_basic_v2/basic4.zip'.format(base_url)\nos.system('curl {} > {}/{}'.format(basic4zip, mydatafolder, 'basic4.zip'))" }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": "total 3.3G\r\ndrwxr-xr-x 6 sd2d-634b36332a0fab-8605aaf2c6e1 users 4.0K Jun 8 18:38 ..\r\ndrwx------ 2 sd2d-634b36332a0fab-8605aaf2c6e1 users 4.0K Jun 8 18:38 .\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 1.1G Jun 8 18:38 basic4.zip\r\n" } ], "source": "!ls -alrht $mydatafolder" }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": "outputpng_folder = mydatafolder + '/png'\nif os.path.exists(outputpng_folder) is False:\n os.makedirs(outputpng_folder)" }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": 
false, "deletable": true, "editable": true }, "outputs": [], "source": "zz = zipfile.ZipFile(mydatafolder + '/' + 'basic4.zip')" }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": "#Use `ibmseti`, or other methods, to draw the spectrograms\n\ndef draw_spectrogram(data):\n \n aca = ibmseti.compamp.SimCompamp(data)\n spec = aca.get_spectrogram()\n\n # Instead of using SimCompAmp.get_spectrogram method\n # perform your own signal processing here before you create the spectrogram\n #\n # SimCompAmp.get_spectrogram is relatively simple. Here's the code to reproduce it:\n #\n # header, raw_data = r.content.split('\\n',1)\n # complex_data = np.frombuffer(raw_data, dtype='i1').astype(np.float32).view(np.complex64)\n # shape = (32, 6144)\n # spec = np.abs( np.fft.fftshift( np.fft.fft( complex_data.reshape(*shape) ), 1) )**2\n # \n # But instead of the line above, can you manipulate `complex_data` with signal processing\n # techniques in the time-domain (windowing?, de-chirp?), or manipulate the output of the \n # np.fft.fft process in a way to improve the signal to noise (Welch periodogram, subtract noise model)? 
\n # \n # example: Apply Hanning Window\n # complex_data = complex_data.reshape(*shape)\n # complex_data = complex_data * np.hanning(complex_data.shape[1])\n # spec = np.abs( np.fft.fftshift( np.fft.fft( complex_data ), 1) )**2\n\n\n fig, ax = plt.subplots(figsize=(10, 5)) \n\n # do different color mappings affect Watson's classification accuracy?\n # ax.imshow(np.log(spec), aspect = 0.5*float(spec.shape[1]) / spec.shape[0], cmap='hot')\n # ax.imshow(np.log(spec), aspect = 0.5*float(spec.shape[1]) / spec.shape[0], cmap='gray')\n # ax.imshow(np.log(spec), aspect = 0.5*float(spec.shape[1]) / spec.shape[0], cmap='Greys')\n \n ax.imshow(np.log(spec), aspect = 0.5*float(spec.shape[1]) / spec.shape[0])\n \n return fig, aca.header()\n" }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": "## We're going to use Spark to distribute the job of creating the PNGs on the executor nodes\n\nrdd = sc.parallelize(zz.namelist(), 120) #30 executors are available on Enterprise clusters" }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": "def extract_data(row):\n return (row, zz.open(row).read())\n\nrdd = rdd.map(extract_data)" }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": "def convert_to_spectrogram_and_save(row):\n name = row[0]\n fig, header = draw_spectrogram(row[1])\n png_file = name + '.png'\n fig.savefig(outputpng_folder + '/' + png_file)\n plt.close(fig)\n return (name, header, png_file)" }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": "rdd = rdd.map(convert_to_spectrogram_and_save)" }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true, "deletable": true, "editable": 
true }, "outputs": [], "source": "results = rdd.collect() #This took about 70s on my Enterprise cluster. It will take longer on your free-tier. " }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/plain": "('000919a5-bc7f-471e-959c-81adba0b1f36.dat',\n {u'signal_classification': u'squiggle',\n u'uuid': u'000919a5-bc7f-471e-959c-81adba0b1f36'},\n '000919a5-bc7f-471e-959c-81adba0b1f36.dat.png')" }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": "results[0]" }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": "# Create Training / Test sets\n\nUsing the `basic` list, we'll create training and test sets for each signal class. Then we'll archive the `.png` files into a handful of `.zip` files (We need the .zip files to be smaller than 100 MB because there is a limitation with the size of batches of data that are uploaded to Watson Visual Recognition when training a classifier.)" }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": "found 4000 files\n" } ], "source": "# Grab the Basic file list in order to \n# Organize the Data into classes\n\nr = requests.get('{}/simsignals_files/public_list_basic_v2_26may_2017.csv'.format(base_url), timeout=(9.0, 21.0))\n\nuuids_classes_as_list = r.text.split('\\n')[1:-1] #slice off the first line (header) and last line (empty)\n\ndef row_to_json(row):\n uuid,sigclass = row.split(',')\n return {'uuid':uuid, 'signal_classification':sigclass}\n\nuuids_classes_as_list = map(lambda row: row_to_json(row), uuids_classes_as_list)\nprint \"found {} files\".format(len(uuids_classes_as_list))\n\nuuids_group_by_class = {}\nfor item in uuids_classes_as_list:\n uuids_group_by_class.setdefault(item['signal_classification'], 
[]).append(item)" }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": "squiggle: training set size: 700\nsquiggle: test set size: 300\nnarrowband: training set size: 700\nnarrowband: test set size: 300\nnoise: training set size: 700\nnoise: test set size: 300\nnarrowbanddrd: training set size: 700\nnarrowbanddrd: test set size: 300\n" } ], "source": "training_percentage = 0.70\n\ntraining_set_group_by_class = {}\ntest_set_group_by_class = {}\nfor k, v in uuids_group_by_class.iteritems():\n \n total = len(v)\n training_size = int(total * training_percentage)\n\n training_set = v[0:training_size]\n test_set = v[training_size:total]\n \n training_set_group_by_class[k] = training_set\n test_set_group_by_class[k] = test_set\n \n print '{}: training set size: {}'.format(k, len(training_set))\n print '{}: test set size: {}'.format(k, len(test_set))" }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/plain": "{'signal_classification': u'noise',\n 'uuid': u'498becc2-3693-45b3-8533-50e93532706a'}" }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": "training_set_group_by_class['noise'][0]" }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": "fnames = [outputpng_folder + '/' + vv['uuid'] + '.dat.png' for vv in v]" }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": "zipfilefolder = mydatafolder + '/zipfiles'\nif os.path.exists(zipfilefolder) is False:\n os.makedirs(zipfilefolder)" }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": "max_zip_file_size_in_mb = 25" 
}, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": "creating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_1_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_2_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_3_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_4_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_5_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_6_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_7_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_8_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_9_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_10_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_1_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_2_narrowband.zip\ncreating new 
archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_3_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_4_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_5_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_6_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_7_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_8_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_9_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_10_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_1_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_2_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_3_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_4_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_5_noise.zip\ncreating new archive 
/gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_6_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_7_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_8_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_9_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_10_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_1_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_2_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_3_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_4_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_5_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_6_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_7_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_8_narrowbanddrd.zip\ncreating new archive 
/gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_9_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/classification_10_narrowbanddrd.zip\n" } ], "source": "#Create the Zip files containing the training PNG files\n#Note that this caps each zip file at roughly `max_zip_file_size_in_mb` MB because WatsonVR has a limit on the \n#size of input files that can be sent in a single HTTP call to train a custom classifier\n\nfor k, v, in training_set_group_by_class.iteritems():\n \n fnames = [outputpng_folder + '/' + vv['uuid'] + '.dat.png' for vv in v] #yes, files are .dat.png :/\n \n count = 1\n for fn in fnames:\n \n archive_name = '{}/classification_{}_{}.zip'.format(zipfilefolder, count, k)\n \n if os.path.exists(archive_name):\n zz = zipfile.ZipFile(archive_name, mode='a')\n else:\n print 'creating new archive', archive_name\n zz = zipfile.ZipFile(archive_name, mode='w')\n \n zz.write(fn)\n zz.close()\n \n #if the archive at archive_name exceeds max_zip_file_size_in_mb MB, increase count to start a new one\n if os.path.getsize(archive_name) > max_zip_file_size_in_mb * 1024 ** 2:\n count += 1\n " }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": "creating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_1_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_2_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_3_squiggle.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_4_squiggle.zip\ncreating new archive 
/gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_1_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_2_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_3_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_4_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_5_narrowband.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_1_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_2_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_3_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_4_noise.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_1_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_2_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_3_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_4_narrowbanddrd.zip\ncreating new archive /gpfs/fs01/user/sd2d-634b36332a0fab-8605aaf2c6e1/notebook/work/my_team_name_data_folder/zipfiles/testset_5_narrowbanddrd.zip\n" } ], 
"source": "#Create the Zip files containing the test PNG files\n#Note that this caps each zip file at roughly `max_zip_file_size_in_mb` MB because WatsonVR has a limit on the \n#size of input files that can be sent in a single HTTP call to train a custom classifier\n\nfor k, v, in test_set_group_by_class.iteritems():\n \n fnames = [outputpng_folder + '/' + vv['uuid'] + '.dat.png' for vv in v] #yes, files are .dat.png :/\n \n count = 1\n for fn in fnames:\n \n archive_name = '{}/testset_{}_{}.zip'.format(zipfilefolder, count, k)\n \n if os.path.exists(archive_name):\n zz = zipfile.ZipFile(archive_name, mode='a')\n else:\n print 'creating new archive', archive_name\n zz = zipfile.ZipFile(archive_name, mode='w')\n \n zz.write(fn)\n zz.close()\n \n #if the archive at archive_name exceeds max_zip_file_size_in_mb MB, increase count to start a new one\n if os.path.getsize(archive_name) > max_zip_file_size_in_mb * 1024 ** 2:\n count += 1\n " }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": "total 4.0G\r\ndrwx------ 4 sd2d-634b36332a0fab-8605aaf2c6e1 users 4.0K Jun 8 18:41 ..\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:41 classification_1_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:41 classification_2_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_3_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_4_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_5_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_6_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_7_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_8_squiggle.zip\r\n-rw------- 1 
sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_9_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 11M Jun 8 18:42 classification_10_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_1_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_2_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_3_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_4_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:42 classification_5_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_6_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_7_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_8_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_9_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 16M Jun 8 18:43 classification_10_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_1_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_2_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_3_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_4_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_5_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_6_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:43 classification_7_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 
classification_8_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_9_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 987K Jun 8 18:44 classification_10_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_1_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_2_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_3_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_4_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_5_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_6_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_7_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_8_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:44 classification_9_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 18M Jun 8 18:45 classification_10_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_1_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_2_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_3_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_4_squiggle.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_1_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_2_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_3_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 
users 26M Jun 8 18:45 testset_4_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 2.3M Jun 8 18:45 testset_5_narrowband.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_1_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:45 testset_2_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:46 testset_3_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 22M Jun 8 18:46 testset_4_noise.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:46 testset_1_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:46 testset_2_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:46 testset_3_narrowbanddrd.zip\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 26M Jun 8 18:46 testset_4_narrowbanddrd.zip\r\ndrwx------ 2 sd2d-634b36332a0fab-8605aaf2c6e1 users 4.0K Jun 8 18:46 .\r\n-rw------- 1 sd2d-634b36332a0fab-8605aaf2c6e1 users 4.0M Jun 8 18:46 testset_5_narrowbanddrd.zip\r\n" } ], "source": "!ls -alrth $mydatafolder/zipfiles" } ], "metadata": { "kernelspec": { "display_name": "Python 2 with Spark 1.6", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }