## Installing `fastai` and version info:

In [None]:
pip install fastai --quiet
pip show fastai fastcore

Name: fastai
Version: 0.0.17
Summary: Version 2 of the fastai library
Home-page: https://github.com/fastai/fastai
Author: Jeremy Howard, Sylvain Gugger, and contributors
Author-email: info@fast.ai
License: Apache Software License 2.0
Location: /usr/local/lib/python3.6/dist-packages
Requires: requests, pillow, torchvision, pandas, spacy, pyyaml, fastprogress, scipy, scikit-learn, fastcore, torch, matplotlib
Required-by: 
---
Name: fastcore
Version: 0.1.17
Summary: Python supercharged for fastai development
Home-page: https://github.com/fastai/fastcore
Author: Jeremy Howard and Sylvain Gugger
Author-email: infos@fast.ai
License: Apache Software License 2.0
Location: /usr/local/lib/python3.6/dist-packages
Requires: numpy, dataclasses
Required-by: fastai


## Library Imports & Pre-Process

In [None]:
from fastai.tabular.all import *

In [None]:
# Fetch the ADULT_SAMPLE dataset with `untar_data` and read its `adult.csv` into a DataFrame
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
# Columns handled as categorical variables
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
# Columns handled as continuous variables
cont_names = ['age', 'fnlwgt', 'education-num']
# Preprocessing steps handed to `TabularPandas`
procs = [Categorify, FillMissing, Normalize]
# Target column
y_names = 'salary'
# Seed the splitter so the train/valid split (and every row shown below) is
# identical after a kernel restart instead of changing on each run.
splits = RandomSplitter(seed=42)(range_of(df))

## `TabularPandas`

In [None]:
# Assemble the `TabularPandas` object from the settings defined in the previous cell
to = TabularPandas(
    df,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=y_names,
    splits=splits,
)

Raw x's and y's:

In [None]:
to.train.xs.iloc[:3]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
27147,5,8,5,14,2,5,1,-0.631822,-0.972539,0.75185
20557,5,12,1,11,2,5,1,-0.705017,-1.500515,-0.424423
5537,5,10,3,13,1,5,1,0.026942,-0.122164,1.14394


In [None]:
to.train.ys.iloc[:3]

Unnamed: 0,salary
23736,0
24771,0
6144,0


## Utility Functions

In [None]:
from IPython.utils import io as io_p

In [None]:
def get_b_w(t):
    "Return `(best, worst)` of timing result `t`, each scaled by 1000 (seconds -> ms) and rounded to 2 decimals"
    scale = lambda secs: round(secs*1000, 2)
    return scale(t.best), scale(t.worst)

In [None]:
def get_avg(a, b, dl):
    "Divide the totals `a` and `b` by the number of batches in `dl`, rounded to 2 decimals"
    n_batches = len(dl)
    return tuple(round(total/n_batches, 2) for total in (a, b))

# Standard `fastai` DataLoader

In [None]:
dls = to.dataloaders(bs=128, device='cpu')

## CPU

### Hidden

In [None]:
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out = new_stdout.getvalue()
sys.stdout = old_stdout

### Show

In [None]:
print(out)

Type = NumPy
Device = cpu
Batch Size = 128
First Batch:
	`train`: Best: 19.43ms, Worst: 20.96ms
	`valid`: Best: 3.59ms, Worst: 3.69ms
All Batches:
	`train`: Best: 703.5ms, Worst: 726.88ms
	`valid`: Best: 170.05ms, Worst: 176.59ms
Average Per Batch:
	`train`: Best: 3.47ms/batch, Worst: 3.58ms/batch
	`valid`: Best: 3.33ms/batch, Worst: 3.46ms/batch



## CUDA

In [None]:
dls.device = 'cuda'

### Hidden

In [None]:
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = fastai\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out = new_stdout.getvalue()
sys.stdout = old_stdout

### Show

In [None]:
print(out)

Type = fastai
Device = cuda
Batch Size = 128
First Batch:
	`train`: Best: 20.0ms, Worst: 2852.74ms
	`valid`: Best: 3.6ms, Worst: 4.01ms
All Batches:
	`train`: Best: 711.13ms, Worst: 723.7ms
	`valid`: Best: 174.96ms, Worst: 180.12ms
Average Per Batch:
	`train`: Best: 3.5ms/batch, Worst: 3.57ms/batch
	`valid`: Best: 3.43ms/batch, Worst: 3.53ms/batch



# Building a Custom `DL`

In [None]:
class TabDataset():
    "A `NumPy` dataset from a `TabularPandas` object"
    def __init__(self, to):
        # `np.int64` rather than the `np.long` alias, which NumPy deprecated in 1.20 and removed in 1.24
        self.cats = to.cats.to_numpy().astype(np.int64)
        self.conts = to.conts.to_numpy().astype(np.float32)
        self.ys = to.ys.to_numpy()
        # Number of rows returned per `__getitem__` call. `TabDataLoader` overwrites
        # this with its own batch size; the default keeps the dataset usable on its own.
        self.bs = 1

    def __getitem__(self, idx):
        "Return `(cats, conts, ys)` for the `bs` consecutive rows starting at `idx[0]`"
        # Only the first index is read: the batch is a contiguous slice, not a gather of every index in `idx`
        idx = idx[0]
        return self.cats[idx:idx+self.bs], self.conts[idx:idx+self.bs], self.ys[idx:idx+self.bs]

    def __len__(self): return len(self.cats)

In [None]:
train_ds = TabDataset(to.train)
valid_ds = TabDataset(to.valid)

In [None]:
class TabDataLoader(DataLoader):
    "A `DataLoader` based on a `TabDataset`"
    def __init__(self, dataset, bs=1, num_workers=0, device='cuda', shuffle=False, **kwargs):
        # The final partial batch is dropped exactly when shuffling (`drop_last=shuffle`)
        super().__init__(dataset, bs=bs, num_workers=num_workers, shuffle=shuffle,
                         device=device, drop_last=shuffle, **kwargs)
        # Tell the dataset how many rows to return per `__getitem__` call
        self.dataset.bs = bs

    def create_item(self, s):
        "Pass `s` through unchanged; rows are only fetched in `create_batch`"
        return s

    def create_batch(self, b):
        "Index the dataset with `b` and return its three arrays as tensors on `self.device`"
        cats, conts, targs = self.dataset[b]
        return tuple(tensor(arr).to(self.device) for arr in (cats, conts, targs))

In [None]:
# Unshuffled loaders over the NumPy datasets.
# The validation loader must wrap `valid_ds`; building it from `train_ds`
# makes the "valid" timings just a second measurement of the training set.
train_dl = TabDataLoader(train_ds, bs=128, shuffle=False)
valid_dl = TabDataLoader(valid_ds, bs=128, shuffle=False)

## CPU

In [None]:
dls = DataLoaders(train_dl, valid_dl, device='cpu')

### Hidden

In [None]:
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out = new_stdout.getvalue()
sys.stdout = old_stdout

### Show

In [None]:
print(out)

Type = NumPy
Device = cpu
Batch Size = 128
First Batch:
	`train`: Best: 0.89ms, Worst: 1.34ms
	`valid`: Best: 0.9ms, Worst: 1.03ms
All Batches:
	`train`: Best: 31.86ms, Worst: 34.0ms
	`valid`: Best: 32.57ms, Worst: 40.9ms
Average Per Batch:
	`train`: Best: 0.16ms/batch, Worst: 0.17ms/batch
	`valid`: Best: 0.16ms/batch, Worst: 0.2ms/batch



## CUDA

In [None]:
dls.device = 'cuda'

### Hidden

In [None]:
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out = new_stdout.getvalue()
sys.stdout = old_stdout

### Show

In [None]:
print(out)

Type = NumPy
Device = cuda
Batch Size = 128
First Batch:
	`train`: Best: 1.03ms, Worst: 1.7ms
	`valid`: Best: 1.03ms, Worst: 1.13ms
All Batches:
	`train`: Best: 52.53ms, Worst: 54.39ms
	`valid`: Best: 53.38ms, Worst: 60.92ms
Average Per Batch:
	`train`: Best: 0.26ms/batch, Worst: 0.27ms/batch
	`valid`: Best: 0.26ms/batch, Worst: 0.3ms/batch



# The `shuffle_fn`

In [None]:
# Don't run
# Reference only: presumably the stock `DataLoader` methods that the patch in the next cell replaces.
# `shuffle_fn` returns a shuffled copy of the index list; `randomize` re-seeds `self.rng` from itself.
def shuffle_fn(self, idxs): return self.rng.sample(idxs, len(idxs))
def randomize(self): self.rng = random.Random(self.rng.randint(0,2**32-1))

In [None]:
@patch
def shuffle_fn(x:TabDataLoader, idxs=None):
    "Shuffle the interior dataset in place and return `idxs` unchanged"
    # `idxs` is accepted (and handed back untouched) so this override stays
    # call-compatible with the `shuffle_fn(idxs)` form; the shuffling itself
    # happens on the dataset's arrays rather than on the index list.
    ds = x.dataset
    # A single permutation applied to all three arrays keeps rows aligned
    perm = np.random.permutation(len(ds))
    ds.cats = ds.cats[perm]
    ds.conts = ds.conts[perm]
    ds.ys = ds.ys[perm]
    return idxs

# `get_idxs`

In [None]:
# Don't run
# Reference only: presumably the stock `DataLoader.get_idxs` that the patch in the next cell replaces.
def get_idxs(self):
    idxs = Inf.count if self.indexed else Inf.nones
    # With a known length `n`, materialise the first `n` indices as a list
    if self.n is not None: idxs = list(itertools.islice(idxs, self.n))
    # Shuffling here reorders the index list itself
    if self.shuffle: idxs = self.shuffle_fn(idxs)
    return idxs

In [None]:
@patch
def get_idxs(x:TabDataLoader):
    "Get the indices to select, shuffling the interior dataset first when `shuffle` is set"
    if x.n is None: idxs = Inf.count if x.indexed else Inf.nones
    else: idxs = list(range(len(x.dataset)))
    # The indices stay in order; `shuffle_fn` reorders the dataset underneath them instead
    if x.shuffle: x.shuffle_fn()
    return idxs

# Final Timings

In [None]:
# Rebuild the loaders after the `shuffle_fn`/`get_idxs` patches above.
# The training loader shuffles, which also drops its last partial batch
# because `TabDataLoader` passes `drop_last=shuffle`.
train_dl = TabDataLoader(train_ds, shuffle=True, bs=128)
valid_dl = TabDataLoader(valid_ds, bs=128)
dls = DataLoaders(train_dl, valid_dl, device='cpu')

## CPU

### Hidden

In [None]:
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out = new_stdout.getvalue()
sys.stdout = old_stdout

In [None]:
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
dls_f = to.dataloaders(bs=128, device='cpu')
print(f'Type = fastai\nDevice = {dls_f.device}\nBatch Size = {dls_f.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls_f.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls_f.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls_f.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls_f.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out2 = new_stdout.getvalue()
sys.stdout = old_stdout

### Show

In [None]:
print(out)

Type = NumPy
Device = cpu
Batch Size = 128
First Batch:
	`train`: Best: 2.4ms, Worst: 4.49ms
	`valid`: Best: 0.3ms, Worst: 0.44ms
All Batches:
	`train`: Best: 33.32ms, Worst: 39.55ms
	`valid`: Best: 7.84ms, Worst: 8.0ms
Average Per Batch:
	`train`: Best: 0.16ms/batch, Worst: 0.19ms/batch
	`valid`: Best: 0.15ms/batch, Worst: 0.16ms/batch



In [None]:
print(out2)

Type = fastai
Device = cpu
Batch Size = 128
First Batch:
	`train`: Best: 18.73ms, Worst: 22.48ms
	`valid`: Best: 3.51ms, Worst: 3.67ms
All Batches:
	`train`: Best: 683.69ms, Worst: 693.98ms
	`valid`: Best: 163.9ms, Worst: 178.87ms
Average Per Batch:
	`train`: Best: 3.37ms/batch, Worst: 3.42ms/batch
	`valid`: Best: 3.21ms/batch, Worst: 3.51ms/batch



## CUDA

### Hidden

In [None]:
dls.device = 'cuda'
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
print(f'Type = NumPy\nDevice = {dls.device}\nBatch Size = {dls.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out = new_stdout.getvalue()
sys.stdout = old_stdout

In [None]:
# We're going to redirect where print goes to
old_stdout = sys.stdout
new_stdout = io.StringIO()
sys.stdout = new_stdout
dls_f.device = 'cuda'
print(f'Type = fastai\nDevice = {dls_f.device}\nBatch Size = {dls_f.bs}') # Print device and batch size
with io_p.capture_output() as captured: # Hide %timeit output
    t = %timeit -o next(iter(dls_f.train)) # Time getting first batch
    best, worst = get_b_w(t) # Round
print(f'First Batch:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o next(iter(dls_f.valid))
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.train: pass # Time going over all batches
    best, worst = get_b_w(t)
print(f'All Batches:\n\t`train`: Best: {best}ms, Worst: {worst}ms')
b_t,w_t = get_avg(best, worst, dls_f.train)
with io_p.capture_output() as captured:
    t = %timeit -o for _ in dls_f.valid: pass
    best, worst = get_b_w(t)
print(f'\t`valid`: Best: {best}ms, Worst: {worst}ms')
b_v,w_v = get_avg(best, worst, dls_f.valid)
print(f'Average Per Batch:\n\t`train`: Best: {b_t}ms/batch, Worst: {w_t}ms/batch')
print(f'\t`valid`: Best: {b_v}ms/batch, Worst: {w_v}ms/batch')
out2 = new_stdout.getvalue()
sys.stdout = old_stdout

### Show

In [None]:
print(out)

Type = NumPy
Device = cuda
Batch Size = 128
First Batch:
	`train`: Best: 2.5ms, Worst: 9.24ms
	`valid`: Best: 0.44ms, Worst: 0.62ms
All Batches:
	`train`: Best: 48.83ms, Worst: 49.51ms
	`valid`: Best: 13.04ms, Worst: 13.57ms
Average Per Batch:
	`train`: Best: 0.24ms/batch, Worst: 0.24ms/batch
	`valid`: Best: 0.26ms/batch, Worst: 0.27ms/batch



In [None]:
print(out2)

Type = fastai
Device = cuda
Batch Size = 128
First Batch:
	`train`: Best: 19.31ms, Worst: 22.86ms
	`valid`: Best: 3.57ms, Worst: 3.75ms
All Batches:
	`train`: Best: 713.25ms, Worst: 745.08ms
	`valid`: Best: 170.11ms, Worst: 179.02ms
Average Per Batch:
	`train`: Best: 3.51ms/batch, Worst: 3.67ms/batch
	`valid`: Best: 3.34ms/batch, Worst: 3.51ms/batch

