From 98d096af4933b491061676e270206b5ea54bf7ac Mon Sep 17 00:00:00 2001
From: Kelvin Davis <273degreeskelvin@gmail.com>
Date: Fri, 25 May 2018 11:30:38 +1000
Subject: [PATCH 1/6] Added traditional ml script

---
 mini_proj/traditionals.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 mini_proj/traditionals.py

diff --git a/mini_proj/traditionals.py b/mini_proj/traditionals.py
new file mode 100644
index 0000000..77ede83
--- /dev/null
+++ b/mini_proj/traditionals.py
@@ -0,0 +1,62 @@
+import numpy as np
+import time as t
+from sklearn import svm, ensemble, naive_bayes, neighbors
+from _image_classifier import ImageClassifier
+
+def precision(y_true, y_pred):
+    y_pred = np.round(y_pred)
+    num = np.sum(np.logical_and(y_true, y_pred))
+    den = np.sum(y_pred)
+    return np.divide(num, den)
+
+def recall(y_true, y_pred):
+    y_pred = np.round(y_pred)
+    num = np.sum(np.logical_and(y_true, y_pred))
+    den = np.sum(y_true)
+    return np.divide(num, den)
+
+def f_measure(y_true, y_pred):
+    p = precision(y_true, y_pred)
+    r = recall(y_true, y_pred)
+    return 2 * p * r / (p + r)
+
+def metric_test(iclf, metric, test_X, test_Y):
+    return metric(test_Y, iclf.predict(test_X))
+
+## Open data
+im_train = np.load('Waldo_train_data.npy')
+im_test = np.load('Waldo_test_data.npy')
+
+lbl_train = np.load('Waldo_train_lbl.npy')
+lbl_test = np.load('Waldo_test_lbl.npy')
+
+# lbl_train = to_categorical(lbl_train) # One hot encoding the labels
+# lbl_test = to_categorical(lbl_test)
+
+my_metric_test = lambda iclf, f: metric_test(iclf, f, im_test, lbl_test)
+
+## Define models
+svm_iclf = ImageClassifier(svm.SVC)
+knn_iclf = ImageClassifier(neighbors.KNeighborsClassifier)
+naive_bayes_iclf = ImageClassifier(naive_bayes.GaussianNB)
+ensemble_iclf = ImageClassifier(ensemble.RandomForestClassifier)
+
+classifiers = [
+    svm_iclf,
+    knn_iclf,
+    naive_bayes_iclf,
+    ensemble_iclf,
+]
+
+for clf in classifiers:
+    start = t.time()  # Record time before training
+    clf.fit(im_train, lbl_train)
+    end = t.time()  # Record time after training
+    print("training time:", end - start)
+    print(clf.score(im_test, lbl_test))
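+    # Also report the metrics defined above, next to the plain accuracy
+    # score. A minimal sketch: this assumes ImageClassifier.predict
+    # returns binary 0/1 labels of the same shape as lbl_test.
+    print("precision:", my_metric_test(clf, precision))
+    print("recall:", my_metric_test(clf, recall))
+    print("f-measure:", my_metric_test(clf, f_measure))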

From 1ef305861d931a98b410371c6d61b959d8e365b7 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Fri, 25 May 2018 12:13:57 +1000
Subject: [PATCH 2/6] Add initial abstract

---
 mini_proj/report/waldo.tex | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/mini_proj/report/waldo.tex b/mini_proj/report/waldo.tex
index 2b101de..b4b1e81 100644
--- a/mini_proj/report/waldo.tex
+++ b/mini_proj/report/waldo.tex
@@ -24,11 +24,24 @@
 
 \begin{document}
   \title{What is Waldo?}
-  \author{Kelvin Davis \and Jip J. Dekker\and Anthony Silvestere}
+  \author{Kelvin Davis \and Jip J. Dekker \and Anthony Silvestere}
   \maketitle
 
   \begin{abstract}
-
+%
+    The famous brand of picture puzzles ``Where's Waldo?'' relates well
+    to many unsolved image classification problems. This offers us the
+    opportunity to test different image classification methods on a data
+    set that is both small enough to process in a reasonable time span
+    and easy for humans to understand. In this report we compare the
+    well-known machine learning methods Naive Bayes, Support Vector
+    Machines, $k$-Nearest Neighbors, and Random Forest against the
+    neural network architectures LeNet, Fully Connected Neural Networks,
+    and Fully Convolutional Neural Networks.
+    \todo{I don't like this big summation but I think it is the
+    important information}
+    Our comparison shows that \todo{...}
+%
   \end{abstract}
 
   \section{Introduction}

From e4cf37d25a4d1e54788d0818ea2b7990fcdb3671 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Fri, 25 May 2018 12:32:39 +1000
Subject: [PATCH 3/6] Naive Bayes description

---
 mini_proj/report/waldo.tex | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/mini_proj/report/waldo.tex b/mini_proj/report/waldo.tex
index 8e21bbe..d101572 100644
--- a/mini_proj/report/waldo.tex
+++ b/mini_proj/report/waldo.tex
@@ -118,7 +118,27 @@
 
   \paragraph{Naive Bayes Classifier}
 
-  \cite{naivebayes}
+  \cite{naivebayes} is a classification method based on Bayes' theorem,
+  shown in \Cref{eq:bayes}. Bayes' theorem allows us to calculate the
+  probability of an event while taking into account prior knowledge of
+  conditions related to the event. In classification this allows us to
+  calculate the probability that a new instance belongs to a certain
+  class based on its features. We then assign the class that has the
+  highest probability.
+
+  \begin{equation}
+    \label{eq:bayes}
+    P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}
+  \end{equation}
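+
+  Concretely, under the ``naive'' assumption that the features
+  $x_1, \dots, x_n$ of an instance are independent given its class, a
+  new instance is assigned the class that maximizes this probability:
+
+  \begin{equation}
+    \label{eq:nbrule}
+    \hat{y} = \operatorname*{arg\,max}_{c} P(c) \prod_{i=1}^{n} P(x_i \mid c)
+  \end{equation}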
Dekker" Date: Fri, 25 May 2018 13:23:21 +1000 Subject: [PATCH 5/6] Add description for SVM --- mini_proj/report/references.bib | 18 +++++++++++++----- mini_proj/report/waldo.tex | 10 +++++++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/mini_proj/report/references.bib b/mini_proj/report/references.bib index a6ad907..a8b643e 100644 --- a/mini_proj/report/references.bib +++ b/mini_proj/report/references.bib @@ -1,8 +1,8 @@ -@misc{openData, - title={Open Database License (ODbL) v1.0}, - url={https://opendatacommons.org/licenses/odbl/1.0/}, - journal={Open Data Commons}, - year={2018}, +@misc{openData, + title={Open Database License (ODbL) v1.0}, + url={https://opendatacommons.org/licenses/odbl/1.0/}, + journal={Open Data Commons}, + year={2018}, month={Feb} } @techreport{knn, @@ -21,6 +21,14 @@ year={1995}, publisher={Springer} } +@inproceedings{svmnonlinear, + title={A training algorithm for optimal margin classifiers}, + author={Boser, Bernhard E and Guyon, Isabelle M and Vapnik, Vladimir N}, + booktitle={Proceedings of the fifth annual workshop on Computational learning theory}, + pages={144--152}, + year={1992}, + organization={ACM} +} @article{naivebayes, title={Idiot's Bayes—not so stupid after all?}, author={Hand, David J and Yu, Keming}, diff --git a/mini_proj/report/waldo.tex b/mini_proj/report/waldo.tex index 6f0f623..6ea06d2 100644 --- a/mini_proj/report/waldo.tex +++ b/mini_proj/report/waldo.tex @@ -142,7 +142,15 @@ \paragraph{Support Vector Machine} - \cite{svm} + (SVM) \cite{svm} has been very successful in many classification tasks. The + method is based on finding boundaries between the different classes. The + boundaries are defined as functions on the features of the instances. The + boundaries are optimized to have the most amount of space between the + boundaries and the training instances on both sides. Originally the + boundaries where linear functions, but more recent development allows for + the training of non-linear boundaries~\cite{svmnonlinear}. Once the training + has defined the boundaries new instances are classified according to on + which side of the boundary they belong. \paragraph{Random Forest} From ab59f456e2aee73da6e65e79bd292a3bd6b4d4c3 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Fri, 25 May 2018 13:37:07 +1000 Subject: [PATCH 6/6] Add paragraph on random forest --- mini_proj/report/waldo.tex | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mini_proj/report/waldo.tex b/mini_proj/report/waldo.tex index 6ea06d2..b5fb94e 100644 --- a/mini_proj/report/waldo.tex +++ b/mini_proj/report/waldo.tex @@ -154,7 +154,14 @@ \paragraph{Random Forest} - \cite{randomforest} + \cite{randomforest} is a method that is based on classifications decision + trees. In a decision tree a new instances is classified by going down a + (binary) tree. Each non-leaf node contain a selection criteria to its + branches. Every leaf node contains the class that will be assigned to the + instance if the node is reached. In other training methods, decision trees + have the tendency to overfit, but in random forest a multitude of decision + tree is trained with a certain degree of randomness and the mean of these + trees is used which avoids this problem. \subsection{Neural Network Architectures} \todo{Did we only do the three in the end? (Alexnet?)}