Creating DataBlock from Numpy Array

fastai
numpy
datablock
Published

June 20, 2024

Feed list of dict into DataBlock

TL;DR: Prepare the data as a list of dictionaries, one per sample, e.g. L([{x: feature1, y: label1}, {x: feature2, y: label2}, ..., {x: featuren, y: labeln}]), and pass the function that builds this list (get_image below) as the get_items parameter of DataBlock.


import numpy as np
import pandas as pd
import torch
from fastai.data.core import Datasets
from fastai.vision.all import *

Load data

# ref: https://www.kaggle.com/code/drkaggle22/digit-recognizer-solution-99-accuracy?scriptVersionId=181451739&cellId=3
import struct

# IDX type code (third byte of the magic number) -> big-endian NumPy dtype string.
_IDX_DTYPES = {
    0x08: '>u1',  # unsigned byte
    0x09: '>i1',  # signed byte
    0x0B: '>i2',  # short
    0x0C: '>i4',  # int
    0x0D: '>f4',  # float
    0x0E: '>f8',  # double
}


def read_idx(filename):
    """Load an IDX file (the MNIST on-disk format) into a NumPy array.

    The header is a 4-byte magic number (two zero bytes, a type code, the
    number of dimensions) followed by one big-endian uint32 per dimension;
    the remaining bytes are the array payload.

    Args:
        filename: Path of the uncompressed IDX file.

    Returns:
        Array with the shape declared in the header. Unsigned-byte files
        (such as MNIST images and labels) come back as a read-only ``uint8``
        view of the file contents; multi-byte element types are converted to
        the native byte order.

    Raises:
        ValueError: If the magic number does not start with two zero bytes,
            the type code is unknown, or the payload size does not match the
            declared shape.
        struct.error: If the header is truncated.
    """
    with open(filename, 'rb') as f:
        zero, data_type, dims = struct.unpack('>HBB', f.read(4))
        if zero != 0:
            # e.g. a still-gzipped download starts with 0x1f8b instead of 0x0000
            raise ValueError(f'{filename!r} is not an IDX file (bad magic prefix {zero:#06x})')
        if data_type not in _IDX_DTYPES:
            raise ValueError(f'{filename!r} has unsupported IDX type code {data_type:#04x}')

        dtype = np.dtype(_IDX_DTYPES[data_type])
        # all dimension sizes are read with a single unpack call
        shape = struct.unpack(f'>{dims}I', f.read(4 * dims))
        data = np.frombuffer(f.read(), dtype=dtype).reshape(shape)

    if dtype.itemsize == 1:
        return data
    # multi-byte payloads are big-endian on disk; hand back native-order values
    return data.astype(dtype.newbyteorder('='))

def load_mnist(image_path, label_path):
    """Read one MNIST split from its IDX files.

    Args:
        image_path: Path of the IDX file holding the images.
        label_path: Path of the IDX file holding the labels.

    Returns:
        Tuple ``(images, labels)`` as returned by ``read_idx`` for each path.
    """
    return read_idx(image_path), read_idx(label_path)

# Locations of the four MNIST IDX files inside the Kaggle dataset input directory.
train_image_path = '/kaggle/input/mnist-dataset/train-images-idx3-ubyte/train-images-idx3-ubyte'
train_label_path = '/kaggle/input/mnist-dataset/train-labels-idx1-ubyte/train-labels-idx1-ubyte'
test_image_path =  '/kaggle/input/mnist-dataset/t10k-images-idx3-ubyte/t10k-images-idx3-ubyte'
test_label_path =  '/kaggle/input/mnist-dataset/t10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte'
# Parse each split into an (images, labels) pair of NumPy arrays.
train_images, train_labels = load_mnist(train_image_path, train_label_path)
test_images, test_labels = load_mnist(test_image_path, test_label_path)
# Print the array shapes as a sanity check on the load.
print(f'Train images shape: {train_images.shape}')
print(f'Train labels shape: {train_labels.shape}')
print(f'Test images shape: {test_images.shape}')
print(f'Test labels shape: {test_labels.shape}')
Train images shape: (60000, 28, 28)
Train labels shape: (60000,)
Test images shape: (10000, 28, 28)
Test labels shape: (10000,)
from collections import Counter
# Tally the samples per digit once and reuse the tally for the class count,
# instead of counting all training labels twice.
label_counts = Counter(train_labels)
print(label_counts)
n_classes = len(label_counts)
print('n_classes:', n_classes)
Counter({1: 6742, 7: 6265, 3: 6131, 2: 5958, 9: 5949, 0: 5923, 6: 5918, 8: 5851, 4: 5842, 5: 5421})
n_classes: 10

def tensor_to_labelled_pil_image(tensor: np.ndarray, labels=None) -> list:
    """Convert a batch of 28x28 images into a list of labelled sample dicts.

    ref: https://www.kaggle.com/code/pemtaira/digit-recognizer-fastai-v2-2020

    The input is viewed as (total sample, 28, 28) and every image is repeated
    on a new channel axis, giving (total sample, 3, 28, 28). Each image is then
    scaled to a float tensor (values divided by 255), passed through
    ``to_image`` and wrapped as a ``PILImage``.

    Args:
        tensor: Array holding the images; it must reshape to (-1, 28, 28).
        labels: Optional sequence indexed in step with the images. When it is
            None every sample gets ``'y': None``.

    Returns:
        List with one ``{'x': image, 'y': label}`` dictionary per image.
    """
    gray = tensor.reshape(-1, 28, 28)  # (total sample, 28, 28)
    rgb = np.stack([gray, gray, gray], axis=1)  # (total sample, 3, 28, 28)

    samples = []
    for idx, channels in enumerate(rgb):
        # float tensor scaled by 1/255, then converted to a PILImage
        scaled = torch.tensor(channels, dtype=torch.float) / 255.
        picture = PILImage(to_image(scaled))
        label = None if labels is None else labels[idx]
        samples.append({'x': picture, 'y': label})

    return samples


def get_image(l:list) -> L:
    """
    Build the per-sample dictionaries from a [features, labels] pair.

    `l[0]` holds the image array and `l[1]` the matching labels; both are
    handed to tensor_to_labelled_pil_image(). The result,
    [{'x': image, 'y': class label}, {...}, {...}], is wrapped in L,
    fastai's implementation of list.
    """
    features, labels = l[0], l[1]
    return L(tensor_to_labelled_pil_image(features, labels))
    
def get_y_fromdict(item):
    """Return the label stored under 'y' in one sample dictionary from get_image()."""
    label = item['y']
    return label

def get_x_fromdict(item):
    """Return the features stored under 'x' in one sample dictionary from get_image()."""
    features = item['x']
    return features

Initialize DataBlock

blocks=(ImageBlock(cls=PILImage), CategoryBlock) > Here we specify that our input data is an image and of class PILImage, our label is categorical

get_items=get_image > Function where we return list of {x:features, y:label} dictionary for all our samples

splitter=RandomSplitter(valid_pct=0.2, seed=42) > Describes how we want to split our data. Here we randomly split the items into 80% training and 20% validation data. We specify a seed to get a reproducible split on each run.

get_x=get_x_fromdict > Function to extract the features from each sample dictionary in the list returned by get_image(). Note that we could also use a lambda function here: get_x = (lambda item: item['x']).

get_y=get_y_fromdict > Function to extract the label from each sample dictionary in the list returned by get_image(). Note that we could also use a lambda function here: get_y = (lambda item: item['y']).

Note that if we use lambda function when initializing DataBlock, we might need to use dill library to export our model.

# Named functions are used for get_x / get_y; the equivalent lambdas
# (lambda item: item['x'] / lambda item: item['y']) would also work here.
mnist_db = DataBlock(
    blocks=(ImageBlock(cls=PILImage), CategoryBlock),
    get_items=get_image,
    get_x=get_x_fromdict,
    get_y=get_y_fromdict,
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
)
# checking features and labels shapes
print(train_images.shape)
print(train_labels.shape)
(60000, 28, 28)
(60000,)
# stacking both train and test sets' features
# (np.vstack joins them along the first, i.e. sample, axis: 60000 + 10000 rows)

print(train_images.shape)
print(test_images.shape)
np.vstack([train_images, test_images]).shape
(60000, 28, 28)
(10000, 28, 28)
(70000, 28, 28)
# stacking both train and test sets' labels
# (the label arrays are 1-D, so np.hstack joins them end to end)
print(train_labels.shape)
print(test_labels.shape)
np.hstack([train_labels, test_labels]).shape
(60000,)
(10000,)
(70000,)

This is how our data is reshaped in tensor_to_labelled_pil_image() function.

# Same steps as in tensor_to_labelled_pil_image(), applied to all samples at once.
features = np.vstack([train_images, test_images])
# the data is already (n, 28, 28), so this reshape leaves the shape unchanged
features_reshaped = features.reshape(-1, 28, 28)
# repeat each image 3 times on a new axis 1: (n, 28, 28) --> (n, 3, 28, 28)
features_reshaped_stacked = np.stack((features_reshaped,) *3, axis = 1)

print('features.shape', features.shape)
print('features_reshaped.shape', features_reshaped.shape)
print('features_reshaped_stacked.shape', features_reshaped_stacked.shape)
features.shape (70000, 28, 28)
features_reshaped.shape (70000, 28, 28)
features_reshaped_stacked.shape (70000, 3, 28, 28)

Quick plot

# ref: https://stackoverflow.com/a/59296746
# Draw the first 100 images of features_reshaped on a 10x10 grid of axes.
import matplotlib.pyplot as plt
fig, axes = plt.subplots(10,10, figsize=(28,28))
for i,ax in enumerate(axes.flat):
    ax.imshow(features_reshaped[i])

Load our source data

# The source is a [features, labels] pair, the layout get_image() expects
# (l[0] = images, l[1] = labels). Train and test sets are stacked together;
# the splitter configured on mnist_db then divides the combined items.
dls = mnist_db.dataloaders([np.vstack([train_images, test_images]),
                            np.hstack([train_labels, test_labels])])
dls.show_batch()

Train model

# Build a resnet18-based learner that reports error rate and accuracy.
learn = vision_learner(dls, resnet18, metrics=[error_rate, accuracy])
# Per the log below: one initial epoch, followed by the 10 requested epochs.
learn.fine_tune(10)
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 146MB/s]
epoch train_loss valid_loss error_rate accuracy time
0 0.714643 0.483854 0.155143 0.844857 04:06
90.00% [9/10 1:10:59<07:53]
epoch train_loss valid_loss error_rate accuracy time
0 0.170800 0.091101 0.026429 0.973571 08:18
1 0.098211 0.057546 0.018071 0.981929 07:50
2 0.070756 0.043570 0.013071 0.986929 07:54
3 0.045105 0.036998 0.010214 0.989786 07:51
4 0.034318 0.037484 0.010214 0.989786 07:51
5 0.032253 0.031844 0.007857 0.992143 07:49
6 0.013959 0.029695 0.006714 0.993286 07:47
7 0.006643 0.028861 0.006643 0.993357 07:48
8 0.002887 0.027575 0.006143 0.993857 07:48

22.83% [50/219 00:06<00:21 0.0014]
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Save model

# Export the trained learner to 'model2.pkl'.
learn.export('model2.pkl')

If we use lambda function when initializing DataBlock, we can use dill to save model. Eg:


# Alternative export for a DataBlock built with lambdas: use dill as the pickle module.
# NOTE(review): this targets the same 'model2.pkl' file name as the plain export above.
import dill
learn.export('model2.pkl', pickle_module=dill)