'''
Created by Tony Silvestre to prepare images for use from a Kaggle Where's Waldo dataset
'''
import math
import os

import cv2
import numpy as np
from matplotlib import pyplot as plt


def gen_data(w_path, n_w_path):
    '''
    Load the Waldo and not-Waldo images, label them (1 = Waldo, 0 = not Waldo),
    shuffle and split them into training and test sets, and save the result as
    .npy files.
    '''
    waldo_file_list = os.listdir(w_path)
    total_w = len(waldo_file_list)
    not_waldo_file_list = os.listdir(n_w_path)
    total_nw = len(not_waldo_file_list)
    imgs_raw = []  # Images
    imgs_lbl = []  # Image labels

    # imgs_raw = np.array([np.array(imread(wdir + "waldo/"+fname)) for fname in os.listdir(wdir + "waldo")])
    for i, image_name in enumerate(waldo_file_list):
        pic = cv2.imread(os.path.join(w_path, image_name))  # NOTE: cv2.imread() returns a numpy array in BGR, not RGB
        if pic is None:  # Skip files that OpenCV cannot decode
            continue
        pic = pic / 255  # Scale pixel values from 0-255 down to the range 0-1
        imgs_raw.append(np.rollaxis(pic, -1))  # Roll the colour axis to position 0 (channels-first)
        imgs_lbl.append(1)  # Label of 1 as Waldo is present in the image

        print('Completed: {0}/{1} Waldo images'.format(i + 1, total_w))

    for i, image_name in enumerate(not_waldo_file_list):
        pic = cv2.imread(os.path.join(n_w_path, image_name))
        if pic is None:  # Skip files that OpenCV cannot decode
            continue
        pic = pic / 255  # Scale pixel values from 0-255 down to the range 0-1
        imgs_raw.append(np.rollaxis(pic, -1))  # Roll the colour axis to position 0 (channels-first)
        imgs_lbl.append(0)  # Label of 0 as Waldo is not present in the image

        print('Completed: {0}/{1} non-Waldo images'.format(i + 1, total_nw))

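    # With 64x64 BGR inputs (as in the default 'waldo_data/64' folders), each
    # entry of imgs_raw should now have shape (3, 64, 64); an optional sanity
    # check could be:
    # assert all(img.shape[0] == 3 for img in imgs_raw)
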
    ## Randomise and split data into training and test sets
    # Adapted from code written by Kyle O'Brien (medium.com/@kylepob61392)
    n_images = len(imgs_raw)
    TRAIN_TEST_SPLIT = 0.75

    # Shuffle the indices and split at the given index
    split_index = int(TRAIN_TEST_SPLIT * n_images)
    shuffled_indices = np.random.permutation(n_images)
    train_indices = shuffled_indices[0:split_index]
    test_indices = shuffled_indices[split_index:]

    train_data = []
    train_lbl = []
    test_data = []
    test_lbl = []

    # Split the images and the labels
    for index in train_indices:
        train_data.append(imgs_raw[index])
        train_lbl.append(imgs_lbl[index])

    for index in test_indices:
        test_data.append(imgs_raw[index])
        test_lbl.append(imgs_lbl[index])

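    # NOTE: scikit-learn could do the same shuffled split in one call, e.g.
    # (sketch only; the extra dependency is not used here):
    #   from sklearn.model_selection import train_test_split
    #   train_data, test_data, train_lbl, test_lbl = train_test_split(
    #       imgs_raw, imgs_lbl, train_size=TRAIN_TEST_SPLIT)
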
    # # Calculate what 30% of each set is
    # third_of_w = math.floor(0.3*total_w)
    # third_of_nw = math.floor(0.3*total_nw)

    # # Split data into training and test data (60%/30%)
    # train_data = np.append(imgs_raw[(third_of_w+1):total_w], imgs_raw[(total_w + third_of_nw + 1):len(imgs_raw)-1], axis=0)
    # train_lbl = np.append(imgs_lbl[(third_of_w+1):total_w], imgs_lbl[(total_w + third_of_nw + 1):len(imgs_lbl)-1], axis=0)
    # # If axis not given, both arrays are flattened before being appended
    # test_data = np.append(imgs_raw[0:third_of_w], imgs_raw[total_w:(total_w + third_of_nw)], axis=0)
    # test_lbl = np.append(imgs_lbl[0:third_of_w], imgs_lbl[total_w:(total_w + third_of_nw)], axis=0)

    try:
        # Save the data as numpy files
        np.save('Waldo_train_data.npy', train_data)
        np.save('Waldo_train_lbl.npy', train_lbl)
        np.save('Waldo_test_data.npy', test_data)
        np.save('Waldo_test_lbl.npy', test_lbl)
        print("All data saved")
    except Exception as e:
        print("ERROR: Data may not be completely saved: {0}".format(e))

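
# A minimal companion loader (illustrative sketch, not part of the original
# pipeline); it assumes the default file names written by gen_data() above.
def load_data():
    train_data = np.load('Waldo_train_data.npy')
    train_lbl = np.load('Waldo_train_lbl.npy')
    test_data = np.load('Waldo_test_data.npy')
    test_lbl = np.load('Waldo_test_lbl.npy')
    return train_data, train_lbl, test_data, test_lbl
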

if __name__ == "__main__":
    # Paths to the Waldo / not-Waldo image directories
    waldo_path = 'waldo_data/64/waldo'
    n_waldo_path = 'waldo_data/64/notwaldo'

    gen_data(waldo_path, n_waldo_path)