diff --git a/mini_proj/report/references.bib b/mini_proj/report/references.bib
index 6b75513..0bf6a63 100644
--- a/mini_proj/report/references.bib
+++ b/mini_proj/report/references.bib
@@ -1,8 +1,8 @@
-@misc{openData,
-  title={Open Database License (ODbL) v1.0},
-  url={https://opendatacommons.org/licenses/odbl/1.0/},
-  journal={Open Data Commons},
-  year={2018},
+@misc{openData,
+  title={Open Database License (ODbL) v1.0},
+  url={https://opendatacommons.org/licenses/odbl/1.0/},
+  journal={Open Data Commons},
+  year={2018},
   month={Feb}
 }
 @techreport{knn,
@@ -21,6 +21,14 @@
   year={1995},
   publisher={Springer}
 }
+@inproceedings{svmnonlinear,
+  title={A training algorithm for optimal margin classifiers},
+  author={Boser, Bernhard E and Guyon, Isabelle M and Vapnik, Vladimir N},
+  booktitle={Proceedings of the fifth annual workshop on Computational learning theory},
+  pages={144--152},
+  year={1992},
+  organization={ACM}
+}
 @article{naivebayes,
   title={Idiot's Bayes—not so stupid after all?},
   author={Hand, David J and Yu, Keming},
diff --git a/mini_proj/report/waldo.tex b/mini_proj/report/waldo.tex
index 3502b6f..3722007 100644
--- a/mini_proj/report/waldo.tex
+++ b/mini_proj/report/waldo.tex
@@ -24,11 +24,23 @@
 \begin{document}

   \title{What is Waldo?}
-  \author{Kelvin Davis \and Jip J. Dekker\and Anthony Silvestere}
+  \author{Kelvin Davis \and Jip J. Dekker \and Anthony Silvestere}
   \maketitle

   \begin{abstract}
-
+%
+  The famous picture puzzle series ``Where's Waldo?'' relates closely to many
+  unsolved image classification problems. This offers us the opportunity to
+  test different image classification methods on a data set that is both small
+  enough to process in a reasonable time and easy for humans to understand. In
+  this report we compare the well-known machine learning methods Naive Bayes,
+  Support Vector Machines, $k$-Nearest Neighbors, and Random Forest against
+  the neural network architectures LeNet and Fully Convolutional Neural
+  Networks.
+  \todo{I don't like this big summation but I think it is the important
+  information}
+  Our comparison shows that \todo{...}
+%
   \end{abstract}

   \section{Introduction}
@@ -87,7 +99,7 @@
   architectures, as this method is currently the most used for image
   classification.

-  \textbf{
+  \todo{
   \\A couple of papers that may be useful (if needed):
   - LeNet: http://yann.lecun.com/exdb/publis/pdf/lecun-01a.pdf
   - AlexNet: http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks
@@ -106,7 +118,17 @@

   \paragraph{Naive Bayes Classifier}

-  \cite{naivebayes}
+  \cite{naivebayes} is a classification method based on Bayes' theorem, shown
+  in \Cref{eq:bayes}. Bayes' theorem allows us to calculate the probability of
+  an event while taking into account prior knowledge of conditions related to
+  that event. In classification this allows us to calculate the probability
+  that a new instance belongs to a certain class based on its features. We
+  then assign the class with the highest probability.
+
+  \begin{equation}
+    \label{eq:bayes}
+    P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}
+  \end{equation}

   \paragraph{$k$-Nearest Neighbors}

@@ -120,11 +142,26 @@

   \paragraph{Support Vector Machine}

-  \cite{svm}
+  (SVM) \cite{svm} has been very successful in many classification tasks. The
+  method is based on finding boundaries between the different classes. The
+  boundaries are defined as functions on the features of the instances.
+  The boundaries are optimized to maximize the margin, i.e. the amount of
+  space between the boundary and the training instances on either side.
+  Originally the boundaries were linear functions, but more recent
+  developments allow for the training of non-linear
+  boundaries~\cite{svmnonlinear}. Once training has defined the boundaries,
+  new instances are classified according to the side of the boundary on which
+  they fall.

   \paragraph{Random Forest}

-  \cite{randomforest}
+  \cite{randomforest} is a method based on classification decision trees. In
+  a decision tree a new instance is classified by descending a (binary) tree.
+  Each non-leaf node contains a selection criterion that determines which
+  branch to follow. Every leaf node contains the class that will be assigned
+  to the instance if that node is reached. Individual decision trees have a
+  tendency to overfit; a random forest avoids this by training a multitude of
+  decision trees with a certain degree of randomness and averaging their
+  predictions.

   \subsection{Neural Network Architectures}
   \tab There are many well established architectures for Neural Networks depending on the task being performed.
@@ -238,9 +275,6 @@

   \clearpage % Ensures that the references are on a seperate page
   \pagebreak

-  % References
-  \section{References}
-  \renewcommand{\refname}{}
   \bibliographystyle{alpha}
   \bibliography{references}
 \end{document}
diff --git a/mini_proj/traditionals.py b/mini_proj/traditionals.py
new file mode 100644
index 0000000..77ede83
--- /dev/null
+++ b/mini_proj/traditionals.py
@@ -0,0 +1,56 @@
+import numpy as np
+import time as t
+from sklearn import svm, ensemble, naive_bayes, neighbors
+from _image_classifier import ImageClassifier
+
+def precision(y_true, y_pred):
+    # Fraction of predicted positives that are actually positive
+    y_pred = np.round(y_pred)
+    num = np.sum(np.logical_and(y_true, y_pred))
+    den = np.sum(y_pred)
+    return np.divide(num, den)
+
+def recall(y_true, y_pred):
+    # Fraction of actual positives that were predicted as positive
+    y_pred = np.round(y_pred)
+    num = np.sum(np.logical_and(y_true, y_pred))
+    den = np.sum(y_true)
+    return np.divide(num, den)
+
+def f_measure(y_true, y_pred):
+    # Harmonic mean of precision and recall
+    p = precision(y_true, y_pred)
+    r = recall(y_true, y_pred)
+    return 2 * p * r / (p + r)
+
+def metric_test(iclf, metric, test_X, test_Y):
+    # Evaluate a fitted classifier on the test set with the given metric
+    return metric(test_Y, iclf.predict(test_X))
+
+## Open data
+im_train = np.load('Waldo_train_data.npy')
+im_test = np.load('Waldo_test_data.npy')
+
+lbl_train = np.load('Waldo_train_lbl.npy')
+lbl_test = np.load('Waldo_test_lbl.npy')
+
+# lbl_train = to_categorical(lbl_train) # One hot encoding the labels
+# lbl_test = to_categorical(lbl_test)
+
+my_metric_test = lambda iclf, f: metric_test(iclf, f, im_test, lbl_test)
+
+## Define models
+svm_iclf = ImageClassifier(svm.SVC)
+knn_iclf = ImageClassifier(neighbors.KNeighborsClassifier)
+naive_bayes_iclf = ImageClassifier(naive_bayes.GaussianNB)
+ensemble_iclf = ImageClassifier(ensemble.RandomForestClassifier)
+
+classifiers = [
+    svm_iclf,
+    knn_iclf,
+    naive_bayes_iclf,
+    ensemble_iclf,
+]
+
+for clf in classifiers:
+    start = t.time()  # Record time before training
+    clf.fit(im_train, lbl_train)
+    end = t.time()  # Record time after training
+    print("training time:", end - start)
+    print(clf.score(im_test, lbl_test))
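
Note on the new traditionals.py: it imports ImageClassifier from _image_classifier, which is not part of this patch. The sketch below is a minimal guess at what such a wrapper could look like, assuming it simply flattens each image into a feature vector and delegates fit/predict/score to the wrapped scikit-learn estimator. Only the class name and constructor usage are taken from traditionals.py; everything else is an assumption.

# Hypothetical sketch of _image_classifier.ImageClassifier (assumption, not part of this patch).
import numpy as np

class ImageClassifier:
    def __init__(self, estimator_cls, **kwargs):
        # Instantiate the wrapped scikit-learn estimator, e.g. svm.SVC
        self.model = estimator_cls(**kwargs)

    @staticmethod
    def _flatten(images):
        # Reshape (n_samples, height, width, channels) into (n_samples, n_features)
        images = np.asarray(images)
        return images.reshape(images.shape[0], -1)

    def fit(self, images, labels):
        self.model.fit(self._flatten(images), np.ravel(labels))
        return self

    def predict(self, images):
        return self.model.predict(self._flatten(images))

    def score(self, images, labels):
        # Mean accuracy of the underlying estimator
        return self.model.score(self._flatten(images), np.ravel(labels))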
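
The precision, recall, and f_measure helpers are defined in traditionals.py but never called; the training loop only reports accuracy. Assuming binary Waldo/not-Waldo labels (which the np.round thresholding already implies), one possible extension of the loop would report all three via metric_test, as sketched here; this is illustrative, not part of the patch.

# Possible extension of the evaluation loop (sketch, not part of this patch).
for clf in classifiers:
    clf.fit(im_train, lbl_train)
    print("accuracy: ", clf.score(im_test, lbl_test))
    print("precision:", metric_test(clf, precision, im_test, lbl_test))
    print("recall:   ", metric_test(clf, recall, im_test, lbl_test))
    print("f-measure:", metric_test(clf, f_measure, im_test, lbl_test))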