# divide data into 10 buckets import random def buckets(filename, bucketName, separator, classColumn): """the original data is in the file named filename bucketName is the prefix for all the bucket names separator is the character that divides the columns (for ex., a tab or comma and classColumn is the column that indicates the class""" # put the data in 10 buckets numberOfBuckets = 10 data = {} # first read in the data and divide by category with open(filename) as f: lines = f.readlines() for line in lines: if separator != '\t': line = line.replace(separator, '\t') # first get the category category = line.split()[classColumn] data.setdefault(category, []) data[category].append(line) # initialize the buckets buckets = [] for i in range(numberOfBuckets): buckets.append([]) # now for each category put the data into the buckets for k in data.keys(): #randomize order of instances for each class random.shuffle(data[k]) bNum = 0 # divide into buckets for item in data[k]: buckets[bNum].append(item) bNum = (bNum + 1) % numberOfBuckets # write to file for bNum in range(numberOfBuckets): f = open("%s-%02i" % (bucketName, bNum + 1), 'w') for item in buckets[bNum]: f.write(item) f.close() # example of how to use this code buckets("pimaSmall.txt", 'pimaSmall',',',8)