Merge branch 'master' of https://github.com/Dekker1/ResearchMethods
230
.gitignore
vendored
@@ -1,3 +1,229 @@
wk7/.ipynb_checkpoints/
wk8/.ipynb_checkpoints/
.ipynb_checkpoints/

*~
## Core latex/pdflatex auxiliary files:
*.aux
*.lof
*.log
*.lot
*.fls
*.out
*.toc
*.fmt
*.fot
*.cb
*.cb2

## Intermediate documents:
*.dvi
*-converted-to.*
# these rules might exclude image files for figures etc.
# *.ps
# *.eps
# *.pdf

## Generated if empty string is given at "Please type another file name for output:"
wk7/week7.pdf
wk8/week8.pdf
wk9/week9.pdf
wk10/week10.pdf

## Waldo data for the mini_proj
mini_proj/waldo_data/*

## Bibliography auxiliary files (bibtex/biblatex/biber):
*.bbl
*.bcf
*.blg
*-blx.aux
*-blx.bib
*.run.xml

## Build tool auxiliary files:
*.fdb_latexmk
*.synctex
*.synctex(busy)
*.synctex.gz
*.synctex.gz(busy)
*.pdfsync

## Auxiliary and intermediate files from other packages:
# algorithms
*.alg
*.loa

# achemso
acs-*.bib

# amsthm
*.thm

# beamer
*.nav
*.pre
*.snm
*.vrb

# changes
*.soc

# cprotect
*.cpt

# elsarticle (documentclass of Elsevier journals)
*.spl

# endnotes
*.ent

# fixme
*.lox

# feynmf/feynmp
*.mf
*.mp
*.t[1-9]
*.t[1-9][0-9]
*.tfm

#(r)(e)ledmac/(r)(e)ledpar
*.end
*.?end
*.[1-9]
*.[1-9][0-9]
*.[1-9][0-9][0-9]
*.[1-9]R
*.[1-9][0-9]R
*.[1-9][0-9][0-9]R
*.eledsec[1-9]
*.eledsec[1-9]R
*.eledsec[1-9][0-9]
*.eledsec[1-9][0-9]R
*.eledsec[1-9][0-9][0-9]
*.eledsec[1-9][0-9][0-9]R

# glossaries
*.acn
*.acr
*.glg
*.glo
*.gls
*.glsdefs

# gnuplottex
*-gnuplottex-*

# gregoriotex
*.gaux
*.gtex

# hyperref
*.brf

# knitr
*-concordance.tex
# TODO Comment the next line if you want to keep your tikz graphics files
*.tikz
*-tikzDictionary

# listings
*.lol

# makeidx
*.idx
*.ilg
*.ind
*.ist

# minitoc
*.maf
*.mlf
*.mlt
*.mtc[0-9]*
*.slf[0-9]*
*.slt[0-9]*
*.stc[0-9]*

# minted
_minted*
*.pyg

# morewrites
*.mw

# nomencl
*.nlo

# pax
*.pax

# pdfpcnotes
*.pdfpc

# sagetex
*.sagetex.sage
*.sagetex.py
*.sagetex.scmd

# scrwfile
*.wrt

# sympy
*.sout
*.sympy
sympy-plots-for-*.tex/

# pdfcomment
*.upa
*.upb

# pythontex
*.pytxcode
pythontex-files-*/

# thmtools
*.loe

# TikZ & PGF
*.dpth
*.md5
*.auxlock

# todonotes
*.tdo

# easy-todo
*.lod

# xindy
*.xdy

# xypic precompiled matrices
*.xyc

# endfloat
*.ttt
*.fff

# Latexian
TSWLatexianTemp*

## Editors:
# WinEdt
*.bak
*.sav

# Texpad
.texpadtmp

# Kile
*.backup

# KBibTeX
*~[0-9]*

# auto folder when using emacs and auctex
/auto/*

# expex forward references with \gathertags
*-tags.tex
66
mini_proj/Load_Images.py
Normal file
@@ -0,0 +1,66 @@
'''
Created by Tony Silvestre to prepare images for use from a Kaggle Where's Waldo dataset
'''
import math
import os

import cv2
import numpy as np


def gen_data(w_path, n_w_path):
    waldo_file_list = os.listdir(w_path)
    total_w = len(waldo_file_list)
    not_waldo_file_list = os.listdir(n_w_path)
    total_nw = len(not_waldo_file_list)
    imgs_raw = []  # Images
    imgs_lbl = []  # Image labels

    for i, image_name in enumerate(waldo_file_list):
        # NOTE: cv2.imread() returns a numpy array in BGR order, not RGB
        pic = cv2.imread(os.path.join(w_path, image_name))
        imgs_raw.append(pic)
        imgs_lbl.append(1)  # Value of 1 as Waldo is present in the image

        print('Completed: {0}/{1} Waldo images'.format(i + 1, total_w))

    for i, image_name in enumerate(not_waldo_file_list):
        pic = cv2.imread(os.path.join(n_w_path, image_name))
        imgs_raw.append(pic)
        imgs_lbl.append(0)  # Value of 0 as Waldo is absent from the image

        print('Completed: {0}/{1} non-Waldo images'.format(i + 1, total_nw))

    # 30% of each class is held out for testing
    test_n_w = math.floor(0.3 * total_w)
    test_n_nw = math.floor(0.3 * total_nw)

    # Split data into training and test sets (70%/30%): the first 30% of each
    # class becomes test data, the remainder becomes training data
    train_data = np.append(imgs_raw[test_n_w:total_w], imgs_raw[(total_w + test_n_nw):], axis=0)
    train_lbl = np.append(imgs_lbl[test_n_w:total_w], imgs_lbl[(total_w + test_n_nw):], axis=0)
    # If axis is not given, both arrays are flattened before being appended
    test_data = np.append(imgs_raw[0:test_n_w], imgs_raw[total_w:(total_w + test_n_nw)], axis=0)
    test_lbl = np.append(imgs_lbl[0:test_n_w], imgs_lbl[total_w:(total_w + test_n_nw)], axis=0)

    try:
        # Save the data as numpy files
        np.save('Waldo_train_data.npy', train_data)
        np.save('Waldo_train_lbl.npy', train_lbl)
        np.save('Waldo_test_data.npy', test_data)
        np.save('Waldo_test_lbl.npy', test_lbl)
        print("All data saved")
    except IOError as e:
        print("ERROR: Data may not be completely saved: {}".format(e))


def main():
    # Paths to the Waldo images
    waldo_path = 'waldo_data/64/waldo'
    n_waldo_path = 'waldo_data/64/notwaldo'

    gen_data(waldo_path, n_waldo_path)


if __name__ == '__main__':
    main()
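A minimal sketch of reading the saved arrays back from another script (the file names are the ones written by gen_data above; the shape noted in the comment assumes the 64x64 colour images from waldo_data/64):

import numpy as np

train_data = np.load('Waldo_train_data.npy')   # images, e.g. (N, 64, 64, 3)
train_lbl = np.load('Waldo_train_lbl.npy')     # labels: 1 = Waldo, 0 = not Waldo
test_data = np.load('Waldo_test_data.npy')
test_lbl = np.load('Waldo_test_lbl.npy')

print(train_data.shape, train_lbl.shape)
print(test_data.shape, test_lbl.shape)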
BIN
mini_proj/Waldo_test_data.npy
Normal file
BIN
mini_proj/Waldo_test_lbl.npy
Normal file
BIN
mini_proj/Waldo_train_data.npy
Normal file
BIN
mini_proj/Waldo_train_lbl.npy
Normal file
128
mini_proj/waldo_model.py
Normal file
@@ -0,0 +1,128 @@
import os
import sys
import time as t

import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Reshape, Permute
from keras.layers import Deconvolution2D, Convolution2D, MaxPooling2D, UpSampling2D, ZeroPadding2D
from keras.layers import Input
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adadelta
from keras.callbacks import ModelCheckpoint

from keras import backend as K
K.set_image_dim_ordering('th')
np.random.seed(7)


'''
Model definition
'''
def FCN():
    ## Sample structure of the intended fully convolutional network (FCN):
    # inputs = Input((1, w, h))

    # conv1 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(inputs)
    # conv1 = Convolution2D(32, 3, 3, activation='relu', border_mode='same')(conv1)
    # m_pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    # conv2 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(m_pool1)
    # drop1 = Dropout(0.2)(conv2)
    # conv2 = Convolution2D(64, 3, 3, activation='relu', border_mode='same')(drop1)
    # m_pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    # conv7 = Convolution2D(512, 3, 3, activation='relu', border_mode='same')(m_pool6)
    # conv7 = Convolution2D(1, 3, 3, activation='relu', border_mode='same')(conv7)

    # up8x = UpSampling2D(size=(2, 2))(conv16x)
    # merge8x = merge([up8x, m_pool3], mode='concat', concat_axis=1)
    # conv8x = Convolution2D(1, 1, 1, activation='relu', border_mode='same')(merge8x)

    # up4x = UpSampling2D(size=(2, 2))(conv8x)
    # merge4x = merge([up4x, m_pool2], mode='concat', concat_axis=1)
    # conv4x = Convolution2D(1, 1, 1, activation='relu', border_mode='same')(merge4x)

    # up4x = UpSampling2D(size=(4, 4))(conv4x)
    # model = Model(input=inputs, output=up4x)

    # NOTE: the FCN sketched above is not implemented yet; the small placeholder
    # below only exists so that the rest of the script runs end-to-end on the
    # 64x64 Waldo / not-Waldo labels.
    model = Sequential()
    model.add(Convolution2D(16, 3, 3, activation='relu', border_mode='same',
                            input_shape=(3, 64, 64)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    # Optimizer uses recommended Adadelta values
    model.compile(optimizer=Adadelta(lr=0.01), loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


## Open data
im_train = np.load('Waldo_train_data.npy')
lbl_train = np.load('Waldo_train_lbl.npy')
im_test = np.load('Waldo_test_data.npy')
lbl_test = np.load('Waldo_test_lbl.npy')
# cv2 loads images as (height, width, channels); convert to the channels-first
# ('th') ordering expected by the model
im_train = im_train.transpose(0, 3, 1, 2)
im_test = im_test.transpose(0, 3, 1, 2)

## Define model
model = FCN()

## Define training parameters
epochs = 40  # an epoch is one forward pass and one backpropagation of all training data
batch_size = 5
#lrate = 0.01
#decay = lrate/epochs
# epoch - one forward pass and one backward pass of all training data
# batch size - number of training examples used in one forward/backward pass
#              (a higher batch size uses more memory)
# learning rate - controls the magnitude of weight changes while training the NN

## Train model
# Purely superficial output
sys.stdout.write("\nFitting model")
sys.stdout.flush()
for i in range(0, 3):
    t.sleep(0.8)
    sys.stdout.write('.')
    sys.stdout.flush()
print()

# Output the model structure
for i, layer in enumerate(model.layers):
    print("Layer {}: {}".format(i, layer.output))
print('-' * 30)

filepath = "checkpoint.hdf5"  # Defines the model checkpoint file
checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=False)  # Defines the checkpoint process
callbacks_list = [checkpoint]  # Adds the checkpoint process to the list of actions performed during training
start = t.time()  # Records time before training

# Fit the model based on the initial parameters
model.fit(im_train, lbl_train, nb_epoch=epochs, batch_size=batch_size,
          verbose=2, shuffle=True, callbacks=callbacks_list)
# If a ValueError is raised here, the output of the network and the
# corresponding lbl_train data probably don't match
end = t.time()  # Records time after training

print('Training Duration: {}'.format(end - start))
print('-' * 30)
print("*** Saving FCN model and weights ***")

'''
# To save model and weights separately:
# save model as json file
model_json = model.to_json()
with open("UNet_model.json", "w") as json_file:
    json_file.write(model_json)
# save weights as h5 file
model.save_weights("UNet_weights.h5")
print("\nModel weights and structure have been saved.\n")
'''
# Save model as one file
model.save('Waldo.h5')
print("\nModel weights and structure have been saved.\n")

## Testing the model
# (The test data was already loaded and converted above)
# Show data stats
print('*' * 30)
print(im_test.shape)
print(lbl_test.shape)
print('*' * 30)
start = t.time()
# Pass the test set through the model
pred_lbl = model.predict(im_test, verbose=1, batch_size=batch_size)
end = t.time()
print("Images generated in {} seconds".format(end - start))
if not os.path.isdir('Test'):
    os.makedirs('Test')  # ensure the output directory exists
np.save('Test/predicted_results.npy', pred_lbl)
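A minimal sketch of reusing the saved Waldo.h5 model from a separate script; load_model restores both the architecture and the weights. The transpose mirrors the channels-first ordering used above, and unpacking two values from evaluate assumes the single accuracy metric compiled into the model:

import numpy as np
from keras.models import load_model

model = load_model('Waldo.h5')  # restores architecture + weights in one call
im_test = np.load('Waldo_test_data.npy').transpose(0, 3, 1, 2)
lbl_test = np.load('Waldo_test_lbl.npy')

loss, acc = model.evaluate(im_test, lbl_test, batch_size=5, verbose=1)
print('Test loss: {:.4f}, test accuracy: {:.4f}'.format(loss, acc))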
25
wk10/week10.tex
Normal file
@@ -0,0 +1,25 @@
\documentclass[a4paper]{article}
% To compile PDF run: latexmk -pdf {filename}.tex

% Math package
\usepackage{amsmath}
%enable \cref{...} and \Cref{...} instead of \ref: Type of reference included in the link
\usepackage[capitalise,nameinlink]{cleveref}
% Enable that parameters of \cref{}, \ref{}, \cite{}, ... are linked so that a reader can click on the number and jump to the target in the document
\usepackage{hyperref}
% UTF-8 encoding
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc} %support umlauts in the input
% Easier compilation
\usepackage{bookmark}
\usepackage{natbib}
\usepackage{graphicx}

\begin{document}
\title{Week 10 - Quantitative data analysis}
\author{Kelvin Davis \and Jip J. Dekker \and Tony Silvestere}
\maketitle



\end{document}
BIN
wk7/correlation.png
Normal file
After Width: | Height: | Size: 244 KiB |
BIN
wk7/handdistr.png
Normal file
After Width: | Height: | Size: 146 KiB |
BIN
wk7/handdistr_gender.png
Normal file
After Width: | Height: | Size: 314 KiB |
BIN
wk7/heightrank.png
Normal file
After Width: | Height: | Size: 176 KiB |
BIN
wk7/outlier.png
Normal file
After Width: | Height: | Size: 96 KiB |
BIN
wk7/pointheight.png
Normal file
After Width: | Height: | Size: 144 KiB |
205
wk7/week7.tex
Normal file
@@ -0,0 +1,205 @@
\documentclass[a4paper]{article}
% To compile PDF run: latexmk -pdf {filename}.tex
% Math package
\usepackage{amsmath}
%enable \cref{...} and \Cref{...} instead of \ref: Type of reference included in the link
\usepackage[capitalise,nameinlink]{cleveref}
% Enable that parameters of \cref{}, \ref{}, \cite{}, ... are linked so that a reader can click on the number and jump to the target in the document
\usepackage{hyperref}
% UTF-8 encoding
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc} %support umlauts in the input
% Easier compilation
\usepackage{bookmark}
\usepackage{graphicx}

\begin{document}
\title{Week 7 - Evidence and experiments}
\author{
  Jai Bheeman \and Kelvin Davis \and Jip J. Dekker \and Nelson Frew \and Tony
  Silvestere
}
\maketitle

\section{Introduction} \label{sec:introduction}

In this report we document a series of hypothesis tests on the provided data
about high-ranking tennis players. These hypotheses concern the effect of a
player's height and handedness on their overall ranking. We first provide an
overview of how we address these questions, with visualisations and a
description of our overall methodology. Following this, we provide a brief
discussion of what can be inferred from our statistical analyses.

\section{Method} \label{sec:method}

We test two hypotheses. The first is that tall players have an advantage over
shorter players; the second is that left-handed players have an advantage over
right-handed players. To build an intuition of how the data behaves with
respect to these hypotheses, we created visual representations using the
Matplotlib and Seaborn libraries, and then performed statistical tests to
measure the effects.

\subsection{Visualisation} \label{subsec:visualisation}

\subsubsection{Effect of Height} \label{subsubsec:vheight}

We started with a scatter plot of the points earned by players against their heights, and were surprised to find a player recorded as approximately 18m tall; as this contradicts the currently held height record of 2.72m, the entry was treated as an error. After removing this outlier, we can see a sufficient spread in height, points and ranking. We can also see a slight discrepancy in height between males and females, and because of this we perform separate statistical tests for males and females to remove the effect of gender. We plot both points against height and height against ranking.

The plot of height against ranking does not show an explicit relationship between the two variables; we test this relation in \cref{sec:results}.

\subsubsection{Effect of Handedness} \label{subsubsec:vhand}

We use distribution plots from Seaborn to visualise the distributions of points earned by left-handed and right-handed players, overlapped on the same plot. The visualisation uses a kernel density estimate of the probability density function derived from the provided sample. We also plot separate distributions for male and female players in case there are any noticeable differences between genders.

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{correlation.png}
  \caption{Correlation matrix of the numerical values in the dataset}
  \label{fig:correlation}
\end{figure}

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{outlier.png}
  \caption{Scatter plot of points against height with an outlier}
  \label{fig:outlier}
\end{figure}

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{pointheight.png}
  \caption{Scatter plot of points against height with the outlier removed}
  \label{fig:pointheight}
\end{figure}

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{heightrank.png}
  \caption{Scatter plot of height against rank}
  \label{fig:heightrank}
\end{figure}

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{handdistr.png}
  \caption{Distribution plots of points separated by handedness}
  \label{fig:handdistr}
\end{figure}

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{handdistr_gender.png}
  \caption{Distribution plots of points separated by handedness, shown separately for male and female players}
  \label{fig:handdistrgender}
\end{figure}

\subsection{Statistical Tests} \label{subsec:stattests}

In testing the first hypothesis, we perform T-tests to analyse the effect of height on the points earned by players. Two T-tests are performed, one for each gender. Within each gender, players are separated into two groups: those who scored above the mean number of points and those who scored below it; these groups are compared in the T-tests. We then perform a $\chi^2$ test on the groups together.

To test the second hypothesis, we use a T-test to measure the effect of handedness, and a $\chi^2$ test to measure the difference between the expected and observed values and obtain the probability that the sample belongs to the $\chi^2$ distribution.

\section{Results} \label{sec:results}

We investigate both the advantage of height and the advantage of being
left-handed using a $\chi^2$ test and a T-test. For every test we state the
exact hypothesis and the null hypothesis.

\subsection{The advantage of height}

\textbf{$\chi^2$-test:} To test if there is an advantage to being tall we ran
a $\chi^2$ test with the following hypotheses:\\
$H$: Players that are taller have a higher rank \\
$H_0$: The rank of a player is independent of their height \\
\\
To perform the test, the players are grouped according to their rank and
whether they are taller than the mean height for their gender. The expected
counts are computed from the probability of being taller than the mean and the
probability of falling in each ranking group. The data used is shown in
\cref{tab:chiheight}.

\begin{table}[ht]
  \centering
  \begin{tabular}{|l|r|r|r|r|}
    \hline
    & \textbf{M: 168 - 188} & \textbf{M: 189 - 210} & \textbf{F: 155 - 171} & \textbf{F: 172 - 189} \\ \hline
    \textbf{1 - 99}    & 67 / 73 & 32 / 26 & 38 / 42 & 60 / 55 \\
    \textbf{100 - 199} & 69 / 72 & 30 / 26 & 31 / 27 & 32 / 36 \\
    \textbf{200 - 299} & 75 / 68 & 17 / 25 & 18 / 17 & 22 / 23 \\
    \textbf{300 - 399} & 61 / 60 & 21 / 23 & 11 / 12 & 17 / 16 \\
    \textbf{400 - 499} & 59 / 60 & 22 / 22 & 7 / 6   & 7 / 8 \\
    \hline
  \end{tabular}
  \caption{Observed / Expected values used for the $\chi^2$-test. The groups are divided by their rank (vertical) and, per gender, their height (horizontal).}
  \label{tab:chiheight}
\end{table}

The $\chi^2$ value found is approximately $7.698$. With 12 degrees of freedom
this gives a $p$-value of approximately $0.808$.

\textbf{T-test:} A slightly different hypothesis can be tested using a T-test:
\\
$H$: Players that are taller have significantly more points \\
$H_0$: The points of a player are independent of their height \\

We ran this T-test twice, once for the women and once for the men, by
splitting the players into two groups: those taller than the mean height and
those shorter than the mean height. Our T-test for the men revealed a T-value
of 1.711723, which has a p-value of 0.043815. For the women the T-value found
was 1.860241, which has a p-value of 0.032030.

\subsection{The advantage of left-handedness}

\textbf{$\chi^2$-test:} To test if there is an advantage to being left-handed
we ran a $\chi^2$ test with the following hypotheses:\\
$H$: Players that are left-handed have a higher rank \\
$H_0$: The rank of a player is independent of their preferred hand \\
\\
To perform the test, the players are grouped according to their rank and
whether they play with their left hand. The expected counts are computed from
the probability of being left-handed. The data used is shown in
\cref{tab:chihand}.

\begin{table}[ht]
  \centering
  \begin{tabular}{|l|l|l|l|l|l|}
    \hline
    & \textbf{1 - 99} & \textbf{100 - 199} & \textbf{200 - 299} & \textbf{300 - 399} & \textbf{400 - 499} \\
    \hline
    \textbf{L} & 22 / 21   & 23 / 18   & 17 / 15   & 6 / 12   & 8 / 10 \\
    \textbf{R} & 174 / 177 & 139 / 144 & 117 / 119 & 105 / 98 & 88 / 86 \\
    \hline
  \end{tabular}
  \caption{Observed / Expected values used for the $\chi^2$-test. The groups are divided by which hand they use (vertical) and their rank (horizontal).}
  \label{tab:chihand}
\end{table}

The $\chi^2$ value found is approximately $6.467$. With 4 degrees of freedom
this gives a $p$-value of approximately $0.167$.

\textbf{T-test:} A slightly different hypothesis can be tested using a T-test:
\\
$H$: Players that are left-handed have significantly more points \\
$H_0$: The points of a player are independent of their preferred hand \\

We ran this T-test by splitting the players into two groups depending on their
preferred hand. Our T-test revealed a T-value of 0.451694, which has a p-value
of 0.325815.


\section{Discussion} \label{sec:discussion}

In our investigation we did not find any strong correlation between the
ranking of a player (or their number of points) and either the hand they play
with or how tall they are. Most tests failed to reach the required
significance level of $p < 0.05$. The only tests that did give positive
results were the T-tests conducted on the relation between height and the
number of points. However, without the $\chi^2$-test confirming the
correlation, the existence of the correlation is questionable.

These results might not be so surprising when the visual exploration is taken
into account. Only slight deviations are visible in our graphs, so the tests
mainly confirmed our suspicion that no definitive correlation exists between
the different attributes.

\end{document}
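The T-tests and chi-squared tests described in week7.tex map directly onto SciPy routines. A minimal sketch using synthetic stand-in data (the points, height and handedness arrays below are made up for illustration; the real analysis would draw them from the provided tennis dataset):

import numpy as np
from scipy import stats

np.random.seed(7)

# Synthetic stand-ins for the real columns: 500 players ranked 1-500
points = np.random.normal(1000, 300, size=500)
height = np.random.normal(185, 7, size=500)
left_handed = np.random.random(500) < 0.15
rank = np.arange(1, 501)

# T-test: compare the points of players taller vs shorter than the mean height
tall = points[height > height.mean()]
short = points[height <= height.mean()]
t_val, p_val = stats.ttest_ind(tall, short, equal_var=False)
print('t = {:.3f}, p = {:.3f}'.format(t_val, p_val))

# Chi-squared test of independence between handedness and rank group (1-99, 100-199, ...)
rank_group = (rank - 1) // 100
observed = np.zeros((2, 5))
for lh, rg in zip(left_handed, rank_group):
    observed[int(lh), rg] += 1
chi2, p, dof, expected = stats.chi2_contingency(observed)
print('chi2 = {:.3f}, p = {:.3f}, dof = {}'.format(chi2, p, dof))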
762
wk7/wk7.ipynb
BIN
wk8/A1_data.xlsx
Normal file
BIN
wk8/distr.png
Normal file
After Width: | Height: | Size: 842 KiB |
BIN
wk8/effect.png
Normal file
After Width: | Height: | Size: 876 KiB |
10
wk8/references.bib
Normal file
@@ -0,0 +1,10 @@
@article{dong2018methods,
  title={Methods for quantifying effects of social unrest using credit card transaction data},
  author={Dong, Xiaowen and Meyer, Joachim and Shmueli, Erez and Bozkaya, Bur{\c{c}}in and Pentland, Alex},
  journal={EPJ Data Science},
  volume={7},
  number={1},
  pages={8},
  year={2018},
  publisher={Springer}
}
187
wk8/week8.tex
Normal file
@@ -0,0 +1,187 @@
\documentclass[a4paper]{article}
% To compile PDF run: latexmk -pdf {filename}.tex

% Math package
\usepackage{amsmath}
%enable \cref{...} and \Cref{...} instead of \ref: Type of reference included in the link
\usepackage[capitalise,nameinlink]{cleveref}
% Enable that parameters of \cref{}, \ref{}, \cite{}, ... are linked so that a reader can click on the number and jump to the target in the document
\usepackage{hyperref}
% UTF-8 encoding
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc} %support umlauts in the input
% Easier compilation
\usepackage{bookmark}
\usepackage{natbib}
\usepackage{graphicx}

\begin{document}
\title{Week 8 - Quantitative data analysis}
\author{
  Jai Bheeman \and Kelvin Davis \and Jip J. Dekker \and Nelson Frew \and Tony
  Silvestere
}
\maketitle

\section{Introduction} \label{sec:introduction}

The purpose of this report is to re-analyse the data presented in the paper by
\citet{dong2018methods}, which investigates the effect that protests (as an
example of disruptive social behaviour in general) have on consumer behaviour.
\citet{dong2018methods} hypothesise that protests decrease consumer spending
in the area surrounding the event, and suggest that consumer spending could be
used as an additional non-traditional economic indicator and as a gauge of
consumer sentiment. Consumer spending was analysed using credit card
transaction data from a metropolitan area within a country that is part of the
Organisation for Economic Co-operation and Development (OECD). Although
\citet{dong2018methods} investigate both temporal and spatial effects on
consumer spending, for the purposes of this analysis only the spatial effect
(in relation to the geographical distance from the event) is considered.

\section{Method} \label{sec:method}

The dataset consists of variables measured as a function of the distance from
the event (in km), including: the number of customers, the median spending
amount, the number of transactions, and the total sales amount. The
re-analysis is conducted on the data provided in the
paper~\citep{dong2018methods}, using Python in conjunction with packages such
as pandas, matplotlib, numpy and seaborn to process and visualise the data. As
mentioned above, only the spatial data and the variables listed above are
considered, for the reference days and the change occurring on Day 62 (the day
of the first socially disruptive event). The distribution of the difference
between the reference period and Day 62 is visualised by plotting a histogram
for each variable. Since the decrease in each of the variables from the
reference period to Day 62 is provided, the mean and the median of these
distributions can be used to perform a one-sample hypothesis test (one-sample
because we are only given the differences) to assess whether the protests on
Day 62 had a discernible effect.

Assuming the mean of each variable over the reference period is the midpoint
between its respective maximum and minimum values, we can reconstruct
approximate actual values for Day 62 (given the decrease in value on Day 62
from the reference period). By comparing these values to the range over the
reference period, another assessment can be made of whether the data shows a
discernible effect on consumer spending as a result of social disruption,
scaling with distance.

Although time series data was not explicitly provided, by extrapolating
information from a graph in \citet{dong2018methods} we can quantify the
decrease in the number of customers and median spending on Day 62 using
information about the reference days (from 43 to 61). After collecting the
values for each of the reference days (43-61), the mean and standard deviation
of this sample can be calculated. Assuming a normal distribution of the data,
we can calculate a z-score for each observation on Day 62, and use this to
assess the original hypothesis.

By performing each of the above tests, we re-analyse the hypothesis of
\citet{dong2018methods} that consumer spending decreases as a result of social
events such as protests. In the Results section we perform the statistical
analyses described above. The results of these tests are then explored in the
Discussion section, along with the assumptions and limitations of the tests
and what can be concluded from them.

\section{Results} \label{sec:results}

For each of the variables in the given data (number of customers, median
spending amount, number of transactions, and sales totals) we construct a
histogram of the decrease of each (on Day 62). We then compute the mean and
median of the data so we can proceed to perform a one-sample hypothesis test.

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{distr.png}
  \caption{Distribution of each of the variables recorded in the data, as a function of the distance from an event}
  \label{fig:distr}
\end{figure}

Using a mean/median of the reference period, obtained by taking the midpoint of the minimum and maximum values for each distance measure, a value can be reconstructed for the measurement on Day 62 (for each location) using:

\begin{equation}
  \textrm{value} = \frac{\textrm{min} + \textrm{max}}{2} - \textrm{decrease}.
  \tag{1}
\end{equation}
\\
We can then plot the maximum and minimum values for the reference period, as well as the reconstructed Day 62 values, to observe the behaviour of consumer spending after the event.

\begin{figure}[ht]
  \centering
  \includegraphics[width=\textwidth]{effect.png}
  \caption{The reconstructed values for Day 62 of each variable plotted against their respective minimums and maximums over the reference period}
  \label{fig:effect}
\end{figure}

Using the data recorded for each of the three distance bands, the mean and standard deviation of the reference period can be calculated. The z-score for each observed value on Day 62 can be computed using:

\begin{equation}
  Z = \frac{X - \mu}{\sigma},
  \tag{2}
\end{equation}
\\
where $X$ is the observed value, and $\mu$ and $\sigma$ are the mean and standard deviation (respectively) of the reference period.

\begin{table}[ht]
  \centering
  \begin{tabular}{|l|l|r|r|}
    \hline
    \textbf{Variable} & \textbf{Distance} & \textbf{X} & \textbf{Z} \\
    \hline
    \textbf{Customers}       & \textless 2km    & -0.600 & -6.87798 \\
    \textbf{Customers}       & 2km - 4km        & -0.200 & -3.33253 \\
    \textbf{Customers}       & \textgreater 4km & -0.100 & -3.70740 \\
    \textbf{Median Spending} & \textless 2km    & -0.200 & -3.05849 \\
    \textbf{Median Spending} & 2km - 4km        & -0.100 & -1.46508 \\
    \textbf{Median Spending} & \textgreater 4km & -0.035 & -1.99199 \\
    \hline
  \end{tabular}
  \caption{The $Z$ score computed using Equation 2 and the temporal data}
  \label{tab:zscores}
\end{table}

\section{Discussion} \label{sec:discussion}

As shown in each of the subplots of \cref{fig:distr}, the mean and median
values of the decrease in each of the distributions are greater than zero
(note: higher values of the decrease variable indicate a larger
decrease/negative change). These mean and median values can be used to perform
a one-sample hypothesis test, which finds that, since each of the mean/median
values is greater than zero, we can infer that the event had a net decreasing
effect on the number of customers, median spending amount, number of
transactions, and total sales amount.

In \cref{fig:effect} values were approximated for each variable on Day 62,
using Equation 1, and plotted against the minimum and maximum values of the
respective variables. This allows us to visually assess whether the
reconstructed value for Day 62 lies outside the range of recorded values for
the reference period, and so presents uncharacteristic behaviour. A decrease
is evident in each of the variables after the event has occurred (on Day 62)
within a distance of approximately 2 km, and appears to stabilise thereafter.
This supports \citeauthor{dong2018methods}'s hypothesis that consumer spending
is affected by socially disruptive events, and also provides evidence for the
spatial scaling of this effect (based on the event location). It is important
to note that the approximation used in this technique is subject to a level of
error, because the mean/median of the reference data is idealised as the
midpoint between the minimum and maximum values provided.

Extrapolating data from a graph in \citet{dong2018methods} provided time
series data (divided into three radii) to analyse. This data was collected by
visually estimating the values from the graph, which inherently introduces a
source of error. However, by computing the z-scores as described in Equation
2, \cref{tab:zscores} was constructed. Each of the z-score values in the table
is negative, indicating a decrease in both the number of customers and median
spending on Day 62. The much larger magnitude of the z-scores for the
\textless 2km distance ring for both variables is in agreement with the
earlier discussion, strengthening the hypothesis of the spatial correlation of
consumer spending.

Each of the above tests agrees on the spatial and temporal correlation between
consumer spending and socially disruptive events. With the limited data
available, we can therefore concur with the hypothesis of
\citet{dong2018methods} that consumer spending decreases in the area around
disruptive social behaviour, having found the temporal correlation on Day 62,
as well as the spatially decreasing effect further from the event.

\bibliographystyle{humannat}
\bibliography{references}

\end{document}
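A minimal sketch of the two calculations used in week8.tex (Equation 1's midpoint reconstruction and Equation 2's z-score), using made-up reference-period numbers rather than the values read off the graph in Dong et al. (2018):

import numpy as np

# Made-up observations for one distance band over the reference days (43-61)
reference = np.array([1.02, 0.98, 1.05, 0.97, 1.01, 0.99, 1.03, 1.00, 0.96,
                      1.04, 1.02, 0.98, 1.01, 0.99, 1.03, 0.97, 1.00, 1.02, 0.98])
decrease_day62 = 0.60  # reported drop relative to the reference period

# Equation 1: reconstruct the Day 62 value from the reference-period midpoint
midpoint = (reference.min() + reference.max()) / 2
value_day62 = midpoint - decrease_day62

# Equation 2: z-score of the reconstructed observation against the reference days
z = (value_day62 - reference.mean()) / reference.std(ddof=1)
print('Reconstructed Day 62 value: {:.3f}, z-score: {:.2f}'.format(value_day62, z))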
639
wk8/wk8.ipynb
27
wk9/FIT4005_Wk9_Report
Normal file
@@ -0,0 +1,27 @@
/* NOTE! This has not been proofread */

What we did (Method)
Provided with a set of 132 unique records of the top 200 male tennis players, we sought to investigate the relationship between the heights of individual players and their respective weights. We conducted basic statistical correlation analyses of the two variables with both Pearson's and Spearman's correlation coefficients to achieve this. Further, to understand the correlations more deeply, we carried out these correlation tests on the full population of cleaned data (duplicates removed, etc.), alongside several random samples and samples of ranking ranges within the top 200. To this end, we made use of Microsoft Excel tools and functions of the Python library SciPy.

What we got (Results)
We performed separate statistical analyses on 10 different samples of the population, as well as the population itself. This included 5 separate subsets of the rankings (top 20 and 50, middle 20, bottom 20 and 50) and 5 separate randomly chosen samples of 20 players.

The results for the tests are as follows (all values rounded to 5 decimal places):

Test Set          Pearson's Coefficient    Spearman's Coefficient
Population        0.77953                  0.73925
Top 20            0.80743                  0.80345
Middle 20         0.54134                  0.36565
Bottom 20         0.84046                  0.88172
Top 50            0.80072                  0.78979
Bottom 50         0.84237                  0.81355
Random set #1     0.84243                  0.80237
Random set #2     0.56564                  0.58714
Random set #3     0.59223                  0.63662
Random set #4     0.65091                  0.58471
Random set #5     0.86203                  0.77832


What this says (Discussion)

The results generally indicate that there is a fairly strong positive correlation between the height and weight of an individual tennis player, within the top 200 male players. The population maintains a strong positive correlation with both Pearson's and Spearman's correlation coefficients, indicating that a relationship may exist. Our population samples show promising consistency with this, with 6 separate samples having values above 0.6 with both techniques. The sample taken from the middle 20 players, however, shows a relatively weaker correlation than the other ranking subsets, which provides some insight into the distribution of the most strongly correlated heights and weights amongst the rankings. However, all five random samples of 20 taken from the population indicate that there does appear to be a consistent trend through the population, which corresponds closely with the coefficients computed on the full population.
BIN
wk9/Tennis players 2017-09 final.xlsx
Normal file
BIN
wk9/pearson.png
Normal file
After Width: | Height: | Size: 90 KiB |
BIN
wk9/spearman.png
Normal file
After Width: | Height: | Size: 88 KiB |
113
wk9/week9.tex
Normal file
@@ -0,0 +1,113 @@
\documentclass[a4paper]{article}
% To compile PDF run: latexmk -pdf {filename}.tex

% Math package
\usepackage{amsmath}
%enable \cref{...} and \Cref{...} instead of \ref: Type of reference included in the link
\usepackage[capitalise,nameinlink]{cleveref}
% Enable that parameters of \cref{}, \ref{}, \cite{}, ... are linked so that a reader can click on the number and jump to the target in the document
\usepackage{hyperref}
% UTF-8 encoding
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc} %support umlauts in the input
% Easier compilation
\usepackage{bookmark}
\usepackage{graphicx}

\begin{document}
\title{Week 9 - Correlation and Regression}
\author{
  Jai Bheeman \and Kelvin Davis \and Jip J. Dekker \and Nelson Frew \and Tony
  Silvestere
}
\maketitle

\section{Introduction} \label{sec:introduction}
We present a report on the relationship between the heights and weights of the
top tennis players as catalogued in the provided data. We use statistical
analysis techniques to numerically describe the characteristics of the data
and to see which trends are exhibited within the data set. We conclude the
report with a brief discussion of the implications of the analysis and provide
insights on potential correlations that may exist.

\section{Method} \label{sec:method}
Provided with a set of 132 unique records of the top 200 male tennis players,
we sought to investigate the relationship between the heights of individual
players and their respective weights. We conducted basic statistical
correlation analyses of the two variables with both Pearson's and Spearman's
correlation coefficients. Further, to understand the correlations more deeply,
we carried out these correlation tests on the full population of cleaned data
(duplicates removed, etc.), alongside several random samples and samples of
ranking ranges within the top 200. To this end, we made use of Microsoft Excel
tools and functions from the Python library SciPy.

We have specifically made use of these separate statistical analysis tools in
the interest of sanity-checking our findings. To do this, we simply replicated
the correlation tests within the other software environment.

\section{Results} \label{sec:results}
We performed separate statistical analyses on 10 different samples of the
population, as well as the population itself. The samples comprised the
following subsets of the rankings:
\begin{itemize}
  \item The top 20 entries
  \item The middle 20 entries
  \item The bottom 20 entries
  \item The top 50 entries
  \item The bottom 50 entries
  \item 5 randomly chosen sets of 20 entries
\end{itemize}
\vspace{1em}
Table \ref{tab:excel_results} shows the results of the conducted tests.

\begin{table}[ht]
  \centering
  \begin{tabular}{|l|r|r|}
    \hline
    \textbf{Test Set} & \textbf{Pearson's Coefficient} & \textbf{Spearman's Coefficient} \\
    \hline
    \textbf{Full Population} & 0.77953 & 0.73925 \\
    \textbf{Top 20}          & 0.80743 & 0.80345 \\
    \textbf{Middle 20}       & 0.54134 & 0.36565 \\
    \textbf{Bottom 20}       & 0.84046 & 0.88172 \\
    \textbf{Top 50}          & 0.80072 & 0.78979 \\
    \textbf{Bottom 50}       & 0.84237 & 0.81355 \\
    \textbf{Random Set \#1}  & 0.84243 & 0.80237 \\
    \textbf{Random Set \#2}  & 0.56564 & 0.58714 \\
    \textbf{Random Set \#3}  & 0.59223 & 0.63662 \\
    \textbf{Random Set \#4}  & 0.65091 & 0.58471 \\
    \textbf{Random Set \#5}  & 0.86203 & 0.77832 \\
    \hline
  \end{tabular}
  \caption{Correlation coefficients between height and weight for the
    different test sets. All values are rounded to 5 decimal places.}
  \label{tab:excel_results}
\end{table}

\begin{figure}[ht]
  \centering
  \includegraphics[width=0.6\textwidth]{pearson.png}
  \includegraphics[width=0.6\textwidth]{spearman.png}
  \caption{The Pearson (top) and Spearman (bottom) correlation coefficients
    of the data set as computed by the pandas Python library}
  \label{fig:scipy}
\end{figure}

\section{Discussion} \label{sec:discussion}
The results generally indicate that there is a fairly strong positive
correlation between the height and weight of an individual tennis player
within the top 200 male players. The population maintains a strong positive
correlation with both Pearson's and Spearman's correlation coefficients,
indicating that a relationship may exist. Our population samples show
promising consistency with this, with 6 separate samples having values above
0.6 with both techniques. The sample taken from the middle 20 players,
however, shows a relatively weaker correlation than the other ranking subsets,
which provides some insight into the distribution of the most strongly
correlated heights and weights amongst the rankings. However, all five random
samples of 20 taken from the population indicate that there does appear to be
a consistent trend through the population, which corresponds closely with the
coefficients computed on the full population.


\end{document}
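A minimal sketch of the correlation checks described in week9.tex, assuming the cleaned table is read from the same CSV and columns (RANK, HEIGHT, Weight) used in wk9.ipynb; the sub-sampling shown is illustrative:

import pandas as pd
from scipy import stats

data = pd.read_csv('Tennis players 2017-09.csv')  # as in wk9.ipynb
data = data[['RANK', 'HEIGHT', 'Weight']].drop_duplicates().dropna()

def correlations(subset, label):
    # Pearson's and Spearman's coefficients between height and weight
    pearson_r, _ = stats.pearsonr(subset['HEIGHT'], subset['Weight'])
    spearman_r, _ = stats.spearmanr(subset['HEIGHT'], subset['Weight'])
    print('{:<12} Pearson {:.5f}  Spearman {:.5f}'.format(label, pearson_r, spearman_r))

correlations(data, 'Population')
correlations(data.nsmallest(20, 'RANK'), 'Top 20')          # best-ranked 20 players
correlations(data.sample(20, random_state=1), 'Random 20')  # one random sample of 20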
252
wk9/wk9.ipynb
Normal file
@@ -0,0 +1,252 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using matplotlib backend: MacOSX\n",
|
||||
"Populating the interactive namespace from numpy and matplotlib\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pylab\n",
|
||||
"%matplotlib inline\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from scipy import stats\n",
|
||||
"from matplotlib import colors\n",
|
||||
"\n",
|
||||
"data = pd.read_csv(\"Tennis players 2017-09.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<style type=\"text/css\" >\n",
|
||||
" #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col0 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col1 {\n",
|
||||
" background-color: #ffd20c;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col2 {\n",
|
||||
" background-color: #ffe619;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col3 {\n",
|
||||
" background-color: #f1f44d;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col0 {\n",
|
||||
" background-color: #ffd20c;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col1 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col2 {\n",
|
||||
" background-color: #e4ff7a;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col3 {\n",
|
||||
" background-color: #e8fc6c;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col0 {\n",
|
||||
" background-color: #ffe619;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col1 {\n",
|
||||
" background-color: #e4ff7a;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col2 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col3 {\n",
|
||||
" background-color: #fe9800;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col0 {\n",
|
||||
" background-color: #f1f44d;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col1 {\n",
|
||||
" background-color: #e8fc6c;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col2 {\n",
|
||||
" background-color: #fe9800;\n",
|
||||
" } #T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col3 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" }</style> \n",
|
||||
"<table id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2\" > \n",
|
||||
"<thead> <tr> \n",
|
||||
" <th class=\"blank level0\" ></th> \n",
|
||||
" <th class=\"col_heading level0 col0\" >DOB</th> \n",
|
||||
" <th class=\"col_heading level0 col1\" >RANK</th> \n",
|
||||
" <th class=\"col_heading level0 col2\" >HEIGHT</th> \n",
|
||||
" <th class=\"col_heading level0 col3\" >Weight</th> \n",
|
||||
" </tr></thead> \n",
|
||||
"<tbody> <tr> \n",
|
||||
" <th id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2level0_row0\" class=\"row_heading level0 row0\" >DOB</th> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col0\" class=\"data row0 col0\" >1</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col1\" class=\"data row0 col1\" >0.277766</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col2\" class=\"data row0 col2\" >0.139684</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row0_col3\" class=\"data row0 col3\" >-0.030479</td> \n",
|
||||
" </tr> <tr> \n",
|
||||
" <th id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2level0_row1\" class=\"row_heading level0 row1\" >RANK</th> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col0\" class=\"data row1 col0\" >0.277766</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col1\" class=\"data row1 col1\" >1</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col2\" class=\"data row1 col2\" >-0.16755</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row1_col3\" class=\"data row1 col3\" >-0.121946</td> \n",
|
||||
" </tr> <tr> \n",
|
||||
" <th id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2level0_row2\" class=\"row_heading level0 row2\" >HEIGHT</th> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col0\" class=\"data row2 col0\" >0.139684</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col1\" class=\"data row2 col1\" >-0.16755</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col2\" class=\"data row2 col2\" >1</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row2_col3\" class=\"data row2 col3\" >0.779526</td> \n",
|
||||
" </tr> <tr> \n",
|
||||
" <th id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2level0_row3\" class=\"row_heading level0 row3\" >Weight</th> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col0\" class=\"data row3 col0\" >-0.030479</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col1\" class=\"data row3 col1\" >-0.121946</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col2\" class=\"data row3 col2\" >0.779526</td> \n",
|
||||
" <td id=\"T_7277b07a_4f3e_11e8_b8a3_787b8ab7acb2row3_col3\" class=\"data row3 col3\" >1</td> \n",
|
||||
" </tr></tbody> \n",
|
||||
"</table> "
|
||||
],
|
||||
"text/plain": [
|
||||
"<pandas.io.formats.style.Styler at 0x1a197d7b38>"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def background_gradient(s, m, M, cmap='Wistia', low=0, high=0):\n",
|
||||
" rng = M - m\n",
|
||||
" norm = colors.Normalize(m - (rng * low),\n",
|
||||
" M + (rng * high))\n",
|
||||
" normed = norm(s.values)\n",
|
||||
" c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]\n",
|
||||
" return ['background-color: %s' % color for color in c]\n",
|
||||
"\n",
|
||||
"data = data[[\"SEX\", \"DOB\", \"RANK\", \"HANDED\", \"Country\", \"HEIGHT\", \"Weight\"]]\n",
|
||||
"data.drop_duplicates\n",
|
||||
"\n",
|
||||
"pearson = data.corr()\n",
|
||||
"pearson.style.apply(background_gradient,\n",
|
||||
" cmap='Wistia',\n",
|
||||
" m=pearson.min().min(),\n",
|
||||
" M=pearson.max().max()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<style type=\"text/css\" >\n",
|
||||
" #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col0 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col1 {\n",
|
||||
" background-color: #ffd20c;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col2 {\n",
|
||||
" background-color: #fee91d;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col3 {\n",
|
||||
" background-color: #f4f242;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col0 {\n",
|
||||
" background-color: #ffd20c;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col1 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col2 {\n",
|
||||
" background-color: #e4ff7a;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col3 {\n",
|
||||
" background-color: #eafa63;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col0 {\n",
|
||||
" background-color: #fee91d;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col1 {\n",
|
||||
" background-color: #e4ff7a;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col2 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col3 {\n",
|
||||
" background-color: #ff9d00;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col0 {\n",
|
||||
" background-color: #f4f242;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col1 {\n",
|
||||
" background-color: #eafa63;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col2 {\n",
|
||||
" background-color: #ff9d00;\n",
|
||||
" } #T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col3 {\n",
|
||||
" background-color: #fc7f00;\n",
|
||||
" }</style> \n",
|
||||
"<table id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2\" > \n",
|
||||
"<thead> <tr> \n",
|
||||
" <th class=\"blank level0\" ></th> \n",
|
||||
" <th class=\"col_heading level0 col0\" >DOB</th> \n",
|
||||
" <th class=\"col_heading level0 col1\" >RANK</th> \n",
|
||||
" <th class=\"col_heading level0 col2\" >HEIGHT</th> \n",
|
||||
" <th class=\"col_heading level0 col3\" >Weight</th> \n",
|
||||
" </tr></thead> \n",
|
||||
"<tbody> <tr> \n",
|
||||
" <th id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2level0_row0\" class=\"row_heading level0 row0\" >DOB</th> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col0\" class=\"data row0 col0\" >1</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col1\" class=\"data row0 col1\" >0.280386</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col2\" class=\"data row0 col2\" >0.122412</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row0_col3\" class=\"data row0 col3\" >0.00769861</td> \n",
|
||||
" </tr> <tr> \n",
|
||||
" <th id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2level0_row1\" class=\"row_heading level0 row1\" >RANK</th> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col0\" class=\"data row1 col0\" >0.280386</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col1\" class=\"data row1 col1\" >1</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col2\" class=\"data row1 col2\" >-0.160006</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row1_col3\" class=\"data row1 col3\" >-0.0908714</td> \n",
|
||||
" </tr> <tr> \n",
|
||||
" <th id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2level0_row2\" class=\"row_heading level0 row2\" >HEIGHT</th> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col0\" class=\"data row2 col0\" >0.122412</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col1\" class=\"data row2 col1\" >-0.160006</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col2\" class=\"data row2 col2\" >1</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row2_col3\" class=\"data row2 col3\" >0.739246</td> \n",
|
||||
" </tr> <tr> \n",
|
||||
" <th id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2level0_row3\" class=\"row_heading level0 row3\" >Weight</th> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col0\" class=\"data row3 col0\" >0.00769861</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col1\" class=\"data row3 col1\" >-0.0908714</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col2\" class=\"data row3 col2\" >0.739246</td> \n",
|
||||
" <td id=\"T_727bef98_4f3e_11e8_a315_787b8ab7acb2row3_col3\" class=\"data row3 col3\" >1</td> \n",
|
||||
" </tr></tbody> \n",
|
||||
"</table> "
|
||||
],
|
||||
"text/plain": [
|
||||
"<pandas.io.formats.style.Styler at 0x111a3b198>"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"spearman = data.corr(method=\"spearman\")\n",
|
||||
"spearman.style.apply(background_gradient,\n",
|
||||
" cmap='Wistia',\n",
|
||||
" m=spearman.min().min(),\n",
|
||||
" M=spearman.max().max()\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|