The objective of this assignment is to learn about simple data curation practices and to become familiar with some of the data we’ll be reusing later.
This notebook uses the notMNIST dataset for the Python experiments that follow. The dataset is designed to look like the classic MNIST dataset, while looking a little more like real data: it’s a harder task, and the data is a lot less ‘clean’ than MNIST.
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
First, we’ll download the dataset to our local machine. The data consists of characters rendered in a variety of fonts on 28x28 images. The labels are limited to ‘A’ through ‘J’ (10 classes). The training set has about 500k labeled examples and the test set about 19,000. Given these sizes, it should be possible to train models quickly on any machine.
url = 'https://commondatastorage.googleapis.com/books1000/'
last_percent_reported = None
data_root = '.' # Change me to store data elsewhere
def download_progress_hook(count, blockSize, totalSize):
    """A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections. Reports every 5% change in download progress.
    """
    global last_percent_reported
    percent = int(count * blockSize * 100 / totalSize)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
            sys.stdout.flush()
        else:
            sys.stdout.write(".")
            sys.stdout.flush()

        last_percent_reported = percent
def maybe_download(filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size."""
    dest_filename = os.path.join(data_root, filename)
    if force or not os.path.exists(dest_filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url + filename, dest_filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')
    statinfo = os.stat(dest_filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', dest_filename)
    else:
        raise Exception(
            'Failed to verify ' + dest_filename + '. Can you get to it with a browser?')
    return dest_filename
train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)
Found and verified ./notMNIST_large.tar.gz
Found and verified ./notMNIST_small.tar.gz
Extract the dataset from the compressed .tar.gz file. This should give you a set of directories, labeled A through J.
num_classes = 10
np.random.seed(133)
def maybe_extract(filename, force=False):
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' % root)
        tar = tarfile.open(filename)
        sys.stdout.flush()
        tar.extractall(data_root)
        tar.close()
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root))
        if os.path.isdir(os.path.join(root, d))]
    if len(data_folders) != num_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders
train_folders = maybe_extract(train_filename)
test_folders = maybe_extract(test_filename)
./notMNIST_large already present - Skipping extraction of ./notMNIST_large.tar.gz.
['./notMNIST_large/A', './notMNIST_large/B', './notMNIST_large/C', './notMNIST_large/D', './notMNIST_large/E', './notMNIST_large/F', './notMNIST_large/G', './notMNIST_large/H', './notMNIST_large/I', './notMNIST_large/J']
./notMNIST_small already present - Skipping extraction of ./notMNIST_small.tar.gz.
['./notMNIST_small/A', './notMNIST_small/B', './notMNIST_small/C', './notMNIST_small/D', './notMNIST_small/E', './notMNIST_small/F', './notMNIST_small/G', './notMNIST_small/H', './notMNIST_small/I', './notMNIST_small/J']
Problem 1 ———
Let’s take a peek at some of the data to make sure it looks sensible. Each exemplar should be an image of a character A through J rendered in a different font. Display a sample of the images that we just downloaded. Hint: you can use the package IPython.display.
# Display ten sample images from each of the first five test classes.
for i in range(5):
    png_list = os.listdir(test_folders[i])
    for j in range(10):
        display(Image(os.path.join(test_folders[i], png_list[j * 100])))
Now let’s load the data in a more manageable format. Since, depending on your computer setup, you might not be able to fit it all in memory, we’ll load each class into a separate dataset, store them on disk, and curate them independently. Later we’ll merge them into a single dataset of manageable size.
We’ll convert the entire dataset into a 3D array (image index, x, y) of floating-point values, normalized to have approximately zero mean and standard deviation ~0.5 to make training easier down the road.
A few images might not be readable; we’ll just skip them.
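To make the normalization concrete, here is a tiny illustration (not part of the original code): a raw pixel value p in [0, 255] is mapped to (p - 255/2) / 255, which lands in roughly [-0.5, 0.5].
# Illustration only: the same normalization applied by load_letter below.
raw = np.array([0.0, 127.5, 255.0])
print((raw - 255.0 / 2) / 255.0)  # -> [-0.5  0.   0.5]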
image_size = 28 # Pixel width and height.
pixel_depth = 255.0 # Number of levels per pixel.
def load_letter(folder, min_num_images):
    """Load the data for a single letter label."""
    image_files = os.listdir(folder)
    dataset = np.ndarray(shape=(len(image_files), image_size, image_size),
                         dtype=np.float32)
    print(folder)
    num_images = 0
    for image in image_files:
        image_file = os.path.join(folder, image)
        try:
            image_data = (ndimage.imread(image_file).astype(float) -
                          pixel_depth / 2) / pixel_depth
            if image_data.shape != (image_size, image_size):
                raise Exception('Unexpected image shape: %s' % str(image_data.shape))
            dataset[num_images, :, :] = image_data
            num_images = num_images + 1
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')

    dataset = dataset[0:num_images, :, :]
    if num_images < min_num_images:
        raise Exception('Many fewer images than expected: %d < %d' %
                        (num_images, min_num_images))

    print('Full dataset tensor:', dataset.shape)
    print('Mean:', np.mean(dataset))
    print('Standard deviation:', np.std(dataset))
    return dataset
def maybe_pickle(data_folders, min_num_images_per_class, force=False):
    dataset_names = []
    for folder in data_folders:
        set_filename = folder + '.pickle'
        dataset_names.append(set_filename)
        if os.path.exists(set_filename) and not force:
            # You may override by setting force=True.
            print('%s already present - Skipping pickling.' % set_filename)
        else:
            print('Pickling %s.' % set_filename)
            dataset = load_letter(folder, min_num_images_per_class)
            try:
                with open(set_filename, 'wb') as f:
                    pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
            except Exception as e:
                print('Unable to save data to', set_filename, ':', e)

    return dataset_names
train_datasets = maybe_pickle(train_folders, 45000)
test_datasets = maybe_pickle(test_folders, 1800)
./notMNIST_large/A.pickle already present - Skipping pickling.
./notMNIST_large/B.pickle already present - Skipping pickling.
./notMNIST_large/C.pickle already present - Skipping pickling.
./notMNIST_large/D.pickle already present - Skipping pickling.
./notMNIST_large/E.pickle already present - Skipping pickling.
./notMNIST_large/F.pickle already present - Skipping pickling.
./notMNIST_large/G.pickle already present - Skipping pickling.
./notMNIST_large/H.pickle already present - Skipping pickling.
./notMNIST_large/I.pickle already present - Skipping pickling.
./notMNIST_large/J.pickle already present - Skipping pickling.
./notMNIST_small/A.pickle already present - Skipping pickling.
./notMNIST_small/B.pickle already present - Skipping pickling.
./notMNIST_small/C.pickle already present - Skipping pickling.
./notMNIST_small/D.pickle already present - Skipping pickling.
./notMNIST_small/E.pickle already present - Skipping pickling.
./notMNIST_small/F.pickle already present - Skipping pickling.
./notMNIST_small/G.pickle already present - Skipping pickling.
./notMNIST_small/H.pickle already present - Skipping pickling.
./notMNIST_small/I.pickle already present - Skipping pickling.
./notMNIST_small/J.pickle already present - Skipping pickling.
Problem 2 ———
Let’s verify that the data still looks good. Display a sample of the labels and images from the ndarray. Hint: you can use matplotlib.pyplot.
pro2_dataset = load_letter(test_folders[0], 10)
./notMNIST_small/A
Could not read: ./notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png : cannot identify image file './notMNIST_small/A/RGVtb2NyYXRpY2FCb2xkT2xkc3R5bGUgQm9sZC50dGY=.png' - it's ok, skipping.
Full dataset tensor: (1872, 28, 28)
Mean: -0.132626
Standard deviation: 0.445128
plt.imshow(pro2_dataset[0])
print(pro2_dataset[0])
(output: a 28x28 array of pixel values for the first image, normalized to the range [-0.5, 0.5]; full numeric printout omitted)
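Alternatively (a sketch, not the original solution), one of the pickled class files can be loaded and several samples shown in a single matplotlib figure:
# Load the pickled 'A' class from the small set and display a handful of samples.
with open(test_datasets[0], 'rb') as f:
    letter_a = pickle.load(f)
fig, axes = plt.subplots(1, 8, figsize=(12, 2))
for idx, ax in enumerate(axes):
    ax.imshow(letter_a[idx], cmap='gray')
    ax.axis('off')
plt.show()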
Merge and prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, so you can tune train_size
as needed. The labels will be stored in a separate array of integers 0 through 9.
Also create a validation dataset for hyperparameter tuning.
def make_arrays(nb_rows, img_size):
    if nb_rows:
        dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32)
        labels = np.ndarray(nb_rows, dtype=np.int32)
    else:
        dataset, labels = None, None
    return dataset, labels
def merge_datasets(pickle_files, train_size, valid_size=0):
    num_classes = len(pickle_files)
    valid_dataset, valid_labels = make_arrays(valid_size, image_size)
    train_dataset, train_labels = make_arrays(train_size, image_size)
    vsize_per_class = valid_size // num_classes
    tsize_per_class = train_size // num_classes

    start_v, start_t = 0, 0
    end_v, end_t = vsize_per_class, tsize_per_class
    end_l = vsize_per_class + tsize_per_class
    for label, pickle_file in enumerate(pickle_files):
        try:
            with open(pickle_file, 'rb') as f:
                letter_set = pickle.load(f)
                # let's shuffle the letters to have random validation and training set
                np.random.shuffle(letter_set)
                if valid_dataset is not None:
                    valid_letter = letter_set[:vsize_per_class, :, :]
                    valid_dataset[start_v:end_v, :, :] = valid_letter
                    valid_labels[start_v:end_v] = label
                    start_v += vsize_per_class
                    end_v += vsize_per_class

                train_letter = letter_set[vsize_per_class:end_l, :, :]
                train_dataset[start_t:end_t, :, :] = train_letter
                train_labels[start_t:end_t] = label
                start_t += tsize_per_class
                end_t += tsize_per_class
        except Exception as e:
            print('Unable to process data from', pickle_file, ':', e)
            raise

    return valid_dataset, valid_labels, train_dataset, train_labels
train_size = 200000
valid_size = 10000
test_size = 10000
valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets(
train_datasets, train_size, valid_size)
_, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size)
print('Training:', train_dataset.shape, train_labels.shape)
print('Validation:', valid_dataset.shape, valid_labels.shape)
print('Testing:', test_dataset.shape, test_labels.shape)
Training: (200000, 28, 28) (200000,)
Validation: (10000, 28, 28) (10000,)
Testing: (10000, 28, 28) (10000,)
Problem 3 ——— Another check: we expect the data to be balanced across classes. Verify that.
from collections import Counter
Counter(train_labels), Counter(test_labels), Counter(valid_labels)
(Counter({0: 20000,
1: 20000,
2: 20000,
3: 20000,
4: 20000,
5: 20000,
6: 20000,
7: 20000,
8: 20000,
9: 20000}),
Counter({0: 1000,
1: 1000,
2: 1000,
3: 1000,
4: 1000,
5: 1000,
6: 1000,
7: 1000,
8: 1000,
9: 1000}),
Counter({0: 1000,
1: 1000,
2: 1000,
3: 1000,
4: 1000,
5: 1000,
6: 1000,
7: 1000,
8: 1000,
9: 1000}))
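Equivalently (a small sketch, not in the original notebook), the balance can be checked with numpy:
# Count examples per class; each of the three splits should be (nearly) uniform across the 10 classes.
for name, labels in [('train', train_labels), ('valid', valid_labels), ('test', test_labels)]:
    counts = np.bincount(labels, minlength=10)
    print(name, counts)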
Next, we’ll randomize the data. It’s important to have the labels well shuffled for the training and test distributions to match.
def randomize(dataset, labels):
    permutation = np.random.permutation(labels.shape[0])
    shuffled_dataset = dataset[permutation, :, :]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)
valid_dataset, valid_labels = randomize(valid_dataset, valid_labels)
Problem 4 ——— Convince yourself that the data is still good after shuffling!
test_labels.shape
(10000,)
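A quick visual sanity check (a sketch, not part of the original answer): show a few shuffled training images next to their labels and confirm they still correspond.
# Display the first five shuffled training images with their letter labels (0 -> 'A', ..., 9 -> 'J').
fig, axes = plt.subplots(1, 5, figsize=(10, 2))
for idx, ax in enumerate(axes):
    ax.imshow(train_dataset[idx], cmap='gray')
    ax.set_title(chr(ord('A') + int(train_labels[idx])))
    ax.axis('off')
plt.show()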
Finally, let’s save the data for later reuse:
pickle_file = os.path.join(data_root, 'notMNIST.pickle')
try:
    f = open(pickle_file, 'wb')
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'valid_dataset': valid_dataset,
        'valid_labels': valid_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)
Compressed pickle size: 690800512
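Later notebooks will read this file back in; a minimal sketch of reloading it, assuming the cell above ran successfully:
# Reload the merged datasets from notMNIST.pickle and confirm the shapes survived the round trip.
with open(pickle_file, 'rb') as f:
    reloaded = pickle.load(f)
print(reloaded['train_dataset'].shape, reloaded['train_labels'].shape)
print(reloaded['valid_dataset'].shape, reloaded['valid_labels'].shape)
print(reloaded['test_dataset'].shape, reloaded['test_labels'].shape)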
Problem 5 ———
By construction, this dataset might contain a lot of overlapping samples, including training data that’s also contained in the validation and test set! Overlap between training and test can skew the results if you expect to use your model in an environment where there is never any overlap, but it is actually fine if you expect to see training samples recur when you use it. Measure how much overlap there is between training, validation and test samples.
Optional questions:
- What about near duplicates between datasets? (images that are almost identical)
- Create a sanitized validation and test set, and compare your accuracy on those in subsequent assignments.
print(train_dataset.shape)
print(train_labels.shape)
print(valid_dataset.shape)
print(valid_labels.shape)
print(test_dataset.shape)
print(test_labels.shape)
print(train_labels[0:20])
(200000, 28, 28)
(200000,)
(10000, 28, 28)
(10000,)
(10000, 28, 28)
(10000,)
[4 9 6 2 7 3 5 9 6 4 7 6 0 1 8 0 1 9 6 5]
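The cell above only prints the shapes; one way to actually measure the overlap Problem 5 asks about is to hash each image’s bytes and intersect the hash sets. This is a sketch that counts exact duplicates only (near duplicates would need a fuzzier comparison):
import hashlib

def image_hashes(dataset):
    # Hash the raw bytes of each image so identical images map to the same key.
    return set(hashlib.sha1(img.tobytes()).hexdigest() for img in dataset)

train_hashes = image_hashes(train_dataset)
valid_hashes = image_hashes(valid_dataset)
test_hashes = image_hashes(test_dataset)
print('train/valid overlap:', len(train_hashes & valid_hashes))
print('train/test overlap:', len(train_hashes & test_hashes))
print('valid/test overlap:', len(valid_hashes & test_hashes))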
Problem 6 ———
Let’s get an idea of what an off-the-shelf classifier can give you on this data. It’s always good to check that there is something to learn, and that it’s a problem that is not so trivial that a canned solution solves it.
Train a simple model on this data using 50, 100, 1000 and 5000 training samples. Hint: you can use the LogisticRegression model from sklearn.linear_model.
Optional question: train an off-the-shelf model on all the data!
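The cells below take a TensorFlow approach instead. As a sketch of the sklearn hint (variable names here are illustrative), logistic regression can be fit on increasing subset sizes and scored on the validation set; the optional question of training on all 200k samples works the same way but takes noticeably longer.
# Flatten the 28x28 images to 784-dimensional feature vectors and fit one model per subset size.
X_valid = valid_dataset.reshape(valid_dataset.shape[0], -1)
for n in [50, 100, 1000, 5000]:
    X_train = train_dataset[:n].reshape(n, -1)
    clf = LogisticRegression()
    clf.fit(X_train, train_labels[:n])
    print(n, 'training samples -> validation accuracy:', clf.score(X_valid, valid_labels))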
import tensorflow as tf

def one_hot(labels, classes=10):
    """Encode integer labels as one-hot vectors of length `classes`."""
    result = list()
    for label in labels:
        encoded = [0] * classes
        encoded[int(label)] = 1
        result.append(encoded)
    return np.array(result)
y = one_hot(test_labels)
for i in range(20):
    print(test_labels[i], y[i])
3 [0 0 0 1 0 0 0 0 0 0]
6 [0 0 0 0 0 0 1 0 0 0]
4 [0 0 0 0 1 0 0 0 0 0]
7 [0 0 0 0 0 0 0 1 0 0]
7 [0 0 0 0 0 0 0 1 0 0]
7 [0 0 0 0 0 0 0 1 0 0]
7 [0 0 0 0 0 0 0 1 0 0]
1 [0 1 0 0 0 0 0 0 0 0]
7 [0 0 0 0 0 0 0 1 0 0]
4 [0 0 0 0 1 0 0 0 0 0]
6 [0 0 0 0 0 0 1 0 0 0]
0 [1 0 0 0 0 0 0 0 0 0]
9 [0 0 0 0 0 0 0 0 0 1]
0 [1 0 0 0 0 0 0 0 0 0]
0 [1 0 0 0 0 0 0 0 0 0]
2 [0 0 1 0 0 0 0 0 0 0]
2 [0 0 1 0 0 0 0 0 0 0]
8 [0 0 0 0 0 0 0 0 1 0]
2 [0 0 1 0 0 0 0 0 0 0]
9 [0 0 0 0 0 0 0 0 0 1]
train_dataset = np.reshape(train_dataset, (200000, -1))
train_labels = one_hot(train_labels)
valid_dataset = np.reshape(valid_dataset, (10000, -1))
valid_labels = one_hot(valid_labels)
print(train_dataset.shape, train_labels.shape)
(200000, 784) (200000, 10)
# Hyperparameters
data_len = 100000
learning_rate = 0.0001
total_steps = 15000
# Hyperparameters for stochastic gradient descent
mini_batch_size = 2000
train_size = train_dataset.shape[0]

# Feed placeholders
train_x = tf.placeholder(shape=[None, 784], dtype=tf.float32)
train_y = tf.placeholder(shape=[None, 10], dtype=tf.float32)

# Neural network model: one hidden layer of 50 ReLU units
with tf.variable_scope("layer_1") as scope:
    w1 = tf.get_variable(name="w1",
                         shape=[784, 50],
                         initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))
    b1 = tf.get_variable(name="b1",
                         shape=[50],
                         initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))
with tf.variable_scope("layer_2") as scope:
    w2 = tf.get_variable(name="w2",
                         shape=[50, 10],
                         initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))
    b2 = tf.get_variable(name="b2",
                         shape=[10],
                         initializer=tf.random_normal_initializer(mean=0.0, stddev=0.01))

# Operations
layer1 = tf.nn.relu(tf.add(tf.matmul(train_x, w1), b1))
logits = tf.add(tf.matmul(layer1, w2), b2)
prob = tf.nn.softmax(logits)
loss = tf.nn.softmax_cross_entropy_with_logits(labels=train_y, logits=logits)  # alternative: tf.reduce_sum(tf.square(prob - train_y))
loss_sum = tf.reduce_sum(loss)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
training = optimizer.minimize(loss_sum)
init = tf.global_variables_initializer()

loss_log = list()
step_log = list()
with tf.Session() as sess:
    sess.run(init)
    for step in range(total_steps):  # stochastic gradient descent
        # Sample a mini-batch
        batch_mask = np.random.choice(train_size, mini_batch_size)
        mini_batch_dataset = train_dataset[batch_mask]
        mini_batch_labels = train_labels[batch_mask]
        # Run one gradient descent step on this mini-batch to update the W, b parameters
        sess.run(training, feed_dict={train_x: mini_batch_dataset, train_y: mini_batch_labels})
        ls = sess.run(loss_sum, feed_dict={train_x: mini_batch_dataset, train_y: mini_batch_labels})
        loss_log.append(ls)
        step_log.append(step)
        if step % 100 == 0:
            print("step :", step, " loss in current mini_batch : ", ls)
    print("Finish stochastic gradient descent Optimization !")

    prediction = tf.equal(tf.argmax(prob, 1), tf.argmax(train_y, 1))
    accuracy = tf.reduce_mean(tf.cast(prediction, "float"))
    ac = sess.run(accuracy, feed_dict={train_x: valid_dataset, train_y: valid_labels})
    print("Accuracy for validation set:", ac)
step : 0 loss in current mini_batch : 4601.67
step : 100 loss in current mini_batch : 1326.26
step : 200 loss in current mini_batch : 1195.37
... (intermediate steps omitted; the mini-batch loss continues to drift down into roughly the 500-700 range) ...
step : 14800 loss in current mini_batch : 658.637
step : 14900 loss in current mini_batch : 682.304
Finish stochastic gradient descent Optimization !
Accuracy for validation set: 0.8785
#show loss changes
plt.plot(step_log[:10000], loss_log[:10000], "r")
plt.ylabel("loss")
plt.show()
# Run the model again with full-batch gradient descent on the first data_len examples
with tf.Session() as sess:
    sess.run(init)
    for step in range(3000):
        sess.run(training, feed_dict={train_x: train_dataset[:data_len], train_y: train_labels[:data_len]})
        ls = sess.run(loss_sum, feed_dict={train_x: train_dataset[:data_len], train_y: train_labels[:data_len]})
        if step % 100 == 0:
            print("step:", step, "loss:", ls)

    prediction = tf.equal(tf.argmax(prob, 1), tf.argmax(train_y, 1))
    accuracy = tf.reduce_mean(tf.cast(prediction, "float"))
    ac = sess.run(accuracy, feed_dict={train_x: valid_dataset, train_y: valid_labels})
    print("\n\n accuracy : ", ac)