From 306a37db1a9be535a0d624b8bf5e1004f218f43c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Sat, 22 Mar 2014 15:48:08 +0100
Subject: [PATCH 1/9] A better structure which is able to start multiple
 spiders.

---
 Fourmi.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index 16029f9..640f9f7 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -10,13 +10,20 @@ from scrapy import log, signals
 from FourmiCrawler.spiders.Chemspider import ChemspiderSpider # [review] - There should be an easy way to import all spiders!
 from scrapy.utils.project import get_project_settings
 
-# [todo] - Add something to add all spiders, with the right references
-spider = ChemspiderSpider(compound = "Aspirin")
-settings = get_project_settings()
-crawler = Crawler(settings)
-crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-crawler.configure()
-crawler.crawl(spider)
-crawler.start()
-log.start()
-reactor.run()
\ No newline at end of file
+defined_spiders = [ChemspiderSpider(compound = "Methane")]
+
+def setup_crawler(Spider, compound):
+  spider = FollowAllSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
+  settings = get_project_settings()
+  crawler = Crawler(settings)
+  crawler.configure()
+  crawler.crawl(spider)
+  crawler.start()
+
+def start():
+  for spider in defined_spiders:
+    setup_crawler(spider, compound)
+  log.start()
+  reactor.run()
+
+start()

From 8175e02f6c54e7b92f35f794a52655963e077e3a Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Thu, 27 Mar 2014 13:08:46 +0100
Subject: [PATCH 2/9] New Structure, splitting on parsers instead of Spiders

---
 Fourmi.py                             | 11 ++++-------
 FourmiCrawler/spiders/Chemspider.py   | 12 ------------
 FourmiCrawler/spiders/Fourmispider.py | 12 ++++++++++++
 FourmiCrawler/spiders/Wikipedia.py    | 12 ------------
 4 files changed, 16 insertions(+), 31 deletions(-)
 delete mode 100644 FourmiCrawler/spiders/Chemspider.py
 create mode 100644 FourmiCrawler/spiders/Fourmispider.py
 delete mode 100644 FourmiCrawler/spiders/Wikipedia.py

diff --git a/Fourmi.py b/Fourmi.py
index 640f9f7..a0a9ead 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -7,13 +7,11 @@ Fourmi - An internet webcrawler searching for information on chemical compounds.
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-defined_spiders = [ChemspiderSpider(compound = "Methane")]
-
-def setup_crawler(Spider, compound):
-  spider = FollowAllSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
+def setup_crawler(compound):
+  spider = FourmiSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
   settings = get_project_settings()
   crawler = Crawler(settings)
   crawler.configure()
@@ -21,8 +19,7 @@ def setup_crawler(Spider, compound):
   crawler.start()
 
 def start():
-  for spider in defined_spiders:
-    setup_crawler(spider, compound)
+  setup_crawler(compound)
   log.start()
   reactor.run()
 
diff --git a/FourmiCrawler/spiders/Chemspider.py b/FourmiCrawler/spiders/Chemspider.py
deleted file mode 100644
index b85b44d..0000000
--- a/FourmiCrawler/spiders/Chemspider.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound] #[TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass 
diff --git a/FourmiCrawler/spiders/Fourmispider.py b/FourmiCrawler/spiders/Fourmispider.py
new file mode 100644
index 0000000..f7b64bd
--- /dev/null
+++ b/FourmiCrawler/spiders/Fourmispider.py
@@ -0,0 +1,12 @@
+from scrapy.spider import Spider
+
+class FourmiSpider(Spider):
+  name="FourmiSpider"
+
+  def __init__(self, compound=None, *args, **kwargs):
+    super(FourmiSpider, self).__init__(*args, **kwargs)
+    # [TODO] - Initiate all parsers for the different websites and get allowed URLs.
+        
+  def parse(self, reponse):
+    # [TODO] - This function should delegate it's functionality to other parsers.
+    pass
diff --git a/FourmiCrawler/spiders/Wikipedia.py b/FourmiCrawler/spiders/Wikipedia.py
deleted file mode 100644
index 62ed026..0000000
--- a/FourmiCrawler/spiders/Wikipedia.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound] #[TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass 

From bdcf359da7c5fdda98dade75e5de908edb4d1f32 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Thu, 27 Mar 2014 13:12:27 +0100
Subject: [PATCH 3/9] Logical fixes to have some "working" case

---
 Fourmi.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index a0a9ead..9bdec24 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -10,8 +10,8 @@ from scrapy import log, signals
 from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-def setup_crawler(compound):
-  spider = FourmiSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
+def setup_crawler(searchable):
+  spider = FourmiSpider(compound=searchable) # [todo] - Do something smart to get the different spiders to work here.
   settings = get_project_settings()
   crawler = Crawler(settings)
   crawler.configure()
@@ -19,7 +19,7 @@ def setup_crawler(compound):
   crawler.start()
 
 def start():
-  setup_crawler(compound)
+  setup_crawler("Methane")
   log.start()
   reactor.run()
 

From 8e9314e753c2390485c96a56364d69dbc0e4f80c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Thu, 27 Mar 2014 13:18:55 +0100
Subject: [PATCH 4/9] One spider should have its own folder

---
 FourmiCrawler/settings.py                            | 4 ++--
 FourmiCrawler/{spiders/Fourmispider.py => spider.py} | 0
 FourmiCrawler/spiders/__init__.py                    | 4 ----
 3 files changed, 2 insertions(+), 6 deletions(-)
 rename FourmiCrawler/{spiders/Fourmispider.py => spider.py} (100%)
 delete mode 100644 FourmiCrawler/spiders/__init__.py

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 0f5eae8..28272d0 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -8,8 +8,8 @@
 
 BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['FourmiCrawler.spiders']
-NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
diff --git a/FourmiCrawler/spiders/Fourmispider.py b/FourmiCrawler/spider.py
similarity index 100%
rename from FourmiCrawler/spiders/Fourmispider.py
rename to FourmiCrawler/spider.py
diff --git a/FourmiCrawler/spiders/__init__.py b/FourmiCrawler/spiders/__init__.py
deleted file mode 100644
index ebd689a..0000000
--- a/FourmiCrawler/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.

From 5b17627504672dc90f0f744d942d2c5c9a055d78 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Thu, 27 Mar 2014 13:23:03 +0100
Subject: [PATCH 5/9] The parsers, however, could use their own folder

---
 FourmiCrawler/parsers/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 FourmiCrawler/parsers/__init__.py

diff --git a/FourmiCrawler/parsers/__init__.py b/FourmiCrawler/parsers/__init__.py
new file mode 100644
index 0000000..e69de29

From 87d10415177ceae3485956bc13450891d3f51182 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Fri, 28 Mar 2014 14:11:36 +0100
Subject: [PATCH 6/9] Made all Python files PEP-8 Compatible

---
 Fourmi.py                 | 24 +++++++++++++-----------
 FourmiCrawler/items.py    |  3 ++-
 FourmiCrawler/settings.py |  6 ++++--
 FourmiCrawler/spider.py   | 21 ++++++++++++---------
 4 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index 9bdec24..96c808e 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical compounds.
-[todo] - Add some more useful text here.
+Fourmi - An internet webcrawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
 """
 
 from twisted.internet import reactor
@@ -10,17 +10,19 @@ from scrapy import log, signals
 from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
+
 def setup_crawler(searchable):
-  spider = FourmiSpider(compound=searchable) # [todo] - Do something smart to get the different spiders to work here.
-  settings = get_project_settings()
-  crawler = Crawler(settings)
-  crawler.configure()
-  crawler.crawl(spider)
-  crawler.start()
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
 
 def start():
-  setup_crawler("Methane")
-  log.start()
-  reactor.run()
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
 
 start()
diff --git a/FourmiCrawler/items.py b/FourmiCrawler/items.py
index 5fedc36..c7fd41c 100644
--- a/FourmiCrawler/items.py
+++ b/FourmiCrawler/items.py
@@ -5,9 +5,10 @@
 
 from scrapy.item import Item, Field
 
+
 class Result(Item):
     attribute = Field()
     value = Field()
     source = Field()
     reliability = Field()
-    conditions = Field()
\ No newline at end of file
+    conditions = Field()
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 28272d0..b025167 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -14,5 +14,7 @@ ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
 
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index f7b64bd..2805c8e 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -1,12 +1,15 @@
 from scrapy.spider import Spider
 
-class FourmiSpider(Spider):
-  name="FourmiSpider"
 
-  def __init__(self, compound=None, *args, **kwargs):
-    super(FourmiSpider, self).__init__(*args, **kwargs)
-    # [TODO] - Initiate all parsers for the different websites and get allowed URLs.
-        
-  def parse(self, reponse):
-    # [TODO] - This function should delegate it's functionality to other parsers.
-    pass
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        # [TODO] - Initiate all parsers for the different websites and get
+        # allowed URLs.
+
+    def parse(self, reponse):
+        # [TODO] - This function should delegate it's functionality to other
+        # parsers.
+        pass

From d91706d6e52892ac1b8bccbe74792bdcf9255ffe Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Fri, 28 Mar 2014 14:14:39 +0100
Subject: [PATCH 7/9] The script should stop sometime, added a stopping signal

---
 Fourmi.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Fourmi.py b/Fourmi.py
index 96c808e..533240e 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -15,6 +15,7 @@ def setup_crawler(searchable):
     spider = FourmiSpider(compound=searchable)
     settings = get_project_settings()
     crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
     crawler.crawl(spider)
     crawler.start()

From 325febe834feaff06f9ceab4462fab17720902ce Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Fri, 28 Mar 2014 14:43:22 +0100
Subject: [PATCH 8/9] Added a basic parser class to extend, next step
 implementing the global function

---
 Fourmi.py               | 2 ++
 FourmiCrawler/spider.py | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index 533240e..f1bf1ba 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -12,6 +12,8 @@ from scrapy.utils.project import get_project_settings
 
 
 def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
     spider = FourmiSpider(compound=searchable)
     settings = get_project_settings()
     crawler = Crawler(settings)
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 2805c8e..4c25df9 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -6,10 +6,11 @@ class FourmiSpider(Spider):
 
     def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        # [TODO] - Initiate all parsers for the different websites and get
-        # allowed URLs.
 
     def parse(self, reponse):
         # [TODO] - This function should delegate it's functionality to other
         # parsers.
         pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)

From 32cedecf2e3ebab0c965457d003a5293f1115d91 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <admin@recondor.com>
Date: Fri, 28 Mar 2014 14:43:22 +0100
Subject: [PATCH 9/9] Added a basic parser class to extend, next step
 implementing the global function

---
 Fourmi.py                       | 2 ++
 FourmiCrawler/parsers/parser.py | 9 +++++++++
 FourmiCrawler/spider.py         | 5 +++--
 3 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 FourmiCrawler/parsers/parser.py

diff --git a/Fourmi.py b/Fourmi.py
index 533240e..f1bf1ba 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -12,6 +12,8 @@ from scrapy.utils.project import get_project_settings
 
 
 def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
     spider = FourmiSpider(compound=searchable)
     settings = get_project_settings()
     crawler = Crawler(settings)
diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py
new file mode 100644
index 0000000..3362d59
--- /dev/null
+++ b/FourmiCrawler/parsers/parser.py
@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    """Basic parser meant to be extended; this parse() only logs a warning."""
+    website = "http://localhost/*"
+
+    def parse(self, reponse):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 2805c8e..4c25df9 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -6,10 +6,11 @@ class FourmiSpider(Spider):
 
     def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        # [TODO] - Initiate all parsers for the different websites and get
-        # allowed URLs.
 
     def parse(self, reponse):
         # [TODO] - This function should delegate it's functionality to other
         # parsers.
         pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)