Silver-T 2018-05-25 13:43:49 +10:00
commit abb5c60939
3 changed files with 112 additions and 14 deletions


@@ -1,8 +1,8 @@
@misc{openData,
title={Open Database License (ODbL) v1.0},
url={https://opendatacommons.org/licenses/odbl/1.0/},
journal={Open Data Commons},
year={2018},
@misc{openData,
title={Open Database License (ODbL) v1.0},
url={https://opendatacommons.org/licenses/odbl/1.0/},
journal={Open Data Commons},
year={2018},
month={Feb}
}
@techreport{knn,
@@ -21,6 +21,14 @@
year={1995},
publisher={Springer}
}
@inproceedings{svmnonlinear,
title={A training algorithm for optimal margin classifiers},
author={Boser, Bernhard E and Guyon, Isabelle M and Vapnik, Vladimir N},
booktitle={Proceedings of the fifth annual workshop on Computational learning theory},
pages={144--152},
year={1992},
organization={ACM}
}
@article{naivebayes,
title={Idiot's Bayes—not so stupid after all?},
author={Hand, David J and Yu, Keming},


@@ -24,11 +24,23 @@
\begin{document}
\title{What is Waldo?}
\author{Kelvin Davis \and Jip J. Dekker\and Anthony Silvestere}
\author{Kelvin Davis \and Jip J. Dekker \and Anthony Silvestere}
\maketitle
\begin{abstract}
%
The famous brand of picture puzzles ``Where's Waldo?'' relates well to many
unsolved image classification problems. This offers us the opportunity to
test different image classification methods on a data set that is both small
enough to process in a reasonable time span and easy for humans to
understand. In this report we compare the well-known machine learning
methods Naive Bayes, Support Vector Machines, $k$-Nearest Neighbors, and
Random Forest against the neural network architectures LeNet and Fully
Convolutional Neural Networks.
\todo{I don't like this big summation but I think it is the important
information}
Our comparison shows that \todo{...}
%
\end{abstract}
\section{Introduction}
@@ -87,7 +99,7 @@
architectures, as this method is currently the most widely used for image
classification.
\textbf{
\todo{
\\A couple of papers that may be useful (if needed):
- LeNet: http://yann.lecun.com/exdb/publis/pdf/lecun-01a.pdf
- AlexNet: http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks
@@ -106,7 +118,17 @@
\paragraph{Naive Bayes Classifier}
\cite{naivebayes}
\cite{naivebayes} is a classification method based on Bayes' theorem, shown
in \Cref{eq:bayes}. Bayes' theorem allows us to calculate the probability of
an event while taking into account prior knowledge of the conditions related
to that event. In classification this allows us to calculate the probability
that a new instance belongs to a certain class based on its features. We
then assign the class with the highest probability.
\begin{equation}
\label{eq:bayes}
P(A\mid B)=\frac {P(B\mid A)\,P(A)}{P(B)}
\end{equation}
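Under the additional assumption that the features $x_1, \dots, x_n$ of an
instance are conditionally independent given its class (the ``naive''
assumption), the predicted class $\hat{y}$ is the one that maximizes this
probability:
\begin{equation}
  \label{eq:bayesrule}
  \hat{y} = \arg\max_{c} P(c) \prod_{i=1}^{n} P(x_i \mid c)
\end{equation}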
\paragraph{$k$-Nearest Neighbors}
@@ -120,11 +142,26 @@
\paragraph{Support Vector Machine}
\cite{svm}
(SVM) \cite{svm} has been very successful in many classification tasks. The
method is based on finding boundaries between the different classes. These
boundaries are defined as functions of the features of the instances and are
optimized to maximize the margin, the amount of space between the boundary
and the training instances on either side. Originally the boundaries were
linear functions, but later developments allow for the training of
non-linear boundaries~\cite{svmnonlinear}. Once training has defined the
boundaries, new instances are classified according to which side of the
boundary they fall on.
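For a linear boundary $w^\top x + b = 0$ and training instances $(x_i, y_i)$
with labels $y_i \in \{-1, +1\}$, this margin maximization can be expressed
as the optimization problem
\begin{equation}
  \label{eq:svm}
  \min_{w, b} \; \frac{1}{2}\|w\|^2
  \quad \mbox{subject to} \quad y_i(w^\top x_i + b) \geq 1 \mbox{ for all } i,
\end{equation}
while the non-linear variants replace the inner products with a kernel
function~\cite{svmnonlinear}.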
\paragraph{Random Forest}
\cite{randomforest}
\cite{randomforest} is a method that is based on classification decision
trees. In a decision tree a new instance is classified by walking down a
(binary) tree. Each non-leaf node contains a selection criterion that
determines which of its branches is followed. Every leaf node contains the
class that will be assigned to the instance if that node is reached. Trained
on their own, decision trees have a tendency to overfit, but in a random
forest a multitude of decision trees is trained with a certain degree of
randomness and their predictions are averaged, which avoids this problem.
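As an illustration, such an ensemble can be built with scikit-learn's
\texttt{RandomForestClassifier}, the implementation used in our experiments;
the data variables in this sketch are placeholders:
\begin{verbatim}
from sklearn.ensemble import RandomForestClassifier

# 100 trees, each fitted on a bootstrap sample of the training data,
# with a random subset of the features considered at every split.
forest = RandomForestClassifier(n_estimators=100)
forest.fit(X_train, y_train)          # X_train: features, y_train: labels
predictions = forest.predict(X_test)  # prediction aggregated over all trees
\end{verbatim}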
\subsection{Neural Network Architectures}
\tab There are many well-established architectures for neural networks, depending on the task being performed.
@@ -238,9 +275,6 @@
\clearpage % Ensures that the references are on a separate page
\pagebreak
% References
\section{References}
\renewcommand{\refname}{}
\bibliographystyle{alpha}
\bibliography{references}
\end{document}

mini_proj/traditionals.py Normal file

@@ -0,0 +1,56 @@
import numpy as np
import time as t
from sklearn import svm, ensemble, naive_bayes, neighbors

from _image_classifier import ImageClassifier


def precision(y_true, y_pred):
    """Fraction of predicted positives that are true positives."""
    y_pred = np.round(y_pred)
    num = np.sum(np.logical_and(y_true, y_pred))
    den = np.sum(y_pred)
    return np.divide(num, den)


def recall(y_true, y_pred):
    """Fraction of actual positives that are predicted as positive."""
    y_pred = np.round(y_pred)
    num = np.sum(np.logical_and(y_true, y_pred))
    den = np.sum(y_true)
    return np.divide(num, den)


def f_measure(y_true, y_pred):
    """Harmonic mean of precision and recall (F1 score)."""
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * p * r / (p + r)


def metric_test(iclf, metric, test_X, test_Y):
    """Evaluate `metric` on a fitted classifier's test-set predictions."""
    return metric(test_Y, iclf.predict(test_X))
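# Quick sanity check of the metric helpers on toy labels (illustrative only):
#   y_true = [1, 0, 1, 1], y_pred = [1, 1, 1, 0]
#   -> precision = 2/3, recall = 2/3, f_measure = 2/3 (their harmonic mean)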
## Open data
im_train = np.load('Waldo_train_data.npy')
im_test = np.load('Waldo_test_data.npy')
lbl_train = np.load('Waldo_train_lbl.npy')
lbl_test = np.load('Waldo_test_lbl.npy')
# lbl_train = to_categorical(lbl_train) # One-hot encoding the labels
# lbl_test = to_categorical(lbl_test)

# Evaluate a given metric for a fitted classifier on the held-out test set
my_metric_test = lambda iclf, f: metric_test(iclf, f, im_test, lbl_test)

## Define models
svm_iclf = ImageClassifier(svm.SVC)
knn_iclf = ImageClassifier(neighbors.KNeighborsClassifier)
naive_bayes_iclf = ImageClassifier(naive_bayes.GaussianNB)
ensemble_iclf = ImageClassifier(ensemble.RandomForestClassifier)

classifiers = [
    svm_iclf,
    knn_iclf,
    naive_bayes_iclf,
    ensemble_iclf,
]
for clf in classifiers:
    start = t.time()  # Records time before training
    clf.fit(im_train, lbl_train)
    end = t.time()  # Records time after training
    print("training time:", end - start)
    print(clf.score(im_test, lbl_test))
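# A possible extension (illustrative): also report the F-measure for each
# classifier by reusing the helpers above, e.g. inside the loop:
#     print("f-measure:", my_metric_test(clf, f_measure))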