From 0cc1b233533a4667175c0f1b2eab9aefc9cca4f2 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 30 Mar 2014 23:37:42 +0200 Subject: [PATCH 1/8] Added the functionality to add parsers and automatically use them. --- Fourmi.py | 26 ++++++++++++++------------ FourmiCrawler/parsers/parser.py | 11 +++++++---- FourmiCrawler/spider.py | 19 ++++++++++++------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 3d54c71..094a5d7 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,23 +9,25 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings +from FourmiCrawler.parsers.parser import Parser def setup_crawler(searchable): - # [TODO] - Initiate all parsers for the different websites and get - # allowed URLs. - spider = FourmiSpider(compound=searchable) - settings = get_project_settings() - crawler = Crawler(settings) - crawler.signals.connect(reactor.stop, signal=signals.spider_closed) - crawler.configure() - crawler.crawl(spider) - crawler.start() + # [TODO] - Initiate all parsers for the different websites and get allowed URLs. + spider = FourmiSpider(compound=searchable) + spider.add_parser(Parser()) + settings = get_project_settings() + crawler = Crawler(settings) + crawler.signals.connect(reactor.stop, signal=signals.spider_closed) + crawler.configure() + crawler.crawl(spider) + crawler.start() def start(): - setup_crawler("Methane") - log.start() - reactor.run() + setup_crawler("Methane") + log.start() + reactor.run() + start() diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 3362d59..7097ee3 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -2,8 +2,11 @@ from scrapy import log class Parser: - website = "http://localhost/*" + ''' + website should be an regular expression of websites you want to parse. 
+ ''' + website = "http://localhost/*" - def parse(self, reponse): - log.msg("The parse function of the empty parser was used.", level=log.Warning) + def parse(self, reponse): + log.msg("The parse function of the empty parser was used.", level=log.WARNING) + pass \ No newline at end of file diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index a08d997..40d6dfc 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -1,19 +1,24 @@ from scrapy.spider import Spider +from scrapy import log +import re class FourmiSpider(Spider): name = "FourmiSpider" + start_urls = ["http://localhost/"] + parsers = [] def __init__(self, compound=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms = [compound] - -def parse(self, reponse): - # [TODO] - This function should delegate it's functionality to other - # parsers. - pass + def parse(self, reponse): + for parser in self.parsers: + if re.match(parser.website, reponse.url): + log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) + return parser.parse(reponse) + return none -def add_parser(self, parser): - self.parsers.add(parser) + def add_parser(self, parser): + self.parsers.append(parser) From 4d9e5307bf0c00f1db07511affd1a7c389efe812 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Mon, 31 Mar 2014 00:48:45 +0200 Subject: [PATCH 2/8] Written a loader for all parsers in the parser directory. 
--- Fourmi.py | 16 +++++++++++++--- FourmiCrawler/spider.py | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 094a5d7..c411b4a 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,13 +9,23 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings -from FourmiCrawler.parsers.parser import Parser +import os, inspect +def load_parsers(rel_dir="FourmiCrawler/parsers"): + path = os.path.dirname(os.path.abspath(__file__)) + path += "/" + rel_dir + parsers = [] + + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: + mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] + for cls in classes: + parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? + return parsers def setup_crawler(searchable): - # [TODO] - Initiate all parsers for the different websites and get allowed URLs. 
spider = FourmiSpider(compound=searchable) - spider.add_parser(Parser()) + spider.add_parsers(load_parsers()) settings = get_project_settings() crawler = Crawler(settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 40d6dfc..d2711c4 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -5,7 +5,6 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" - start_urls = ["http://localhost/"] parsers = [] def __init__(self, compound=None, *args, **kwargs): @@ -22,3 +21,6 @@ class FourmiSpider(Spider): def add_parser(self, parser): self.parsers.append(parser) + + def add_parsers(self, parsers): + self.parsers.extend(parsers) From e39ed3b68139fa38449cc8948938b59b0d0ff9f1 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 20:56:32 +0200 Subject: [PATCH 3/8] Added a way for parsers to access the spider. --- FourmiCrawler/parsers/parser.py | 12 ++++++++++-- FourmiCrawler/spider.py | 13 +++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 7097ee3..78d9dc1 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -5,8 +5,16 @@ class Parser: ''' website should be an regular expression of websites you want to parse. 
''' - website = "http://localhost/*" + website = "http://something/*" + __spider = None def parse(self, reponse): log.msg("The parse function of the empty parser was used.", level=log.WARNING) - pass \ No newline at end of file + pass + + def generate_search_url(self, compound): + # return website[:-1] + compound + pass + + def set_spider(self, spider): + self.__spider = spider diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index d2711c4..9b356f8 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -16,11 +16,12 @@ class FourmiSpider(Spider): if re.match(parser.website, reponse.url): log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) return parser.parse(reponse) - return none - - - def add_parser(self, parser): - self.parsers.append(parser) + return None def add_parsers(self, parsers): - self.parsers.extend(parsers) + for parser in parsers: + self.add_parser(parser) + + def add_parser(self, parser): + self.parsers.add(parser) + parser.set_spider(self) \ No newline at end of file From f93dc2d1602b00c735236a02d7a7611be57657be Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:07:36 +0200 Subject: [PATCH 4/8] Added an structure to get requests for all websites for a new synonym --- FourmiCrawler/parsers/parser.py | 7 ++++--- FourmiCrawler/spider.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 78d9dc1..68f73cf 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -1,9 +1,10 @@ from scrapy import log +from scrapy.http import Request class Parser: ''' - website should be an regular expression of websites you want to parse. + website should be an regular expression of the urls of request the parser is able to parse. 
''' website = "http://something/*" __spider = None @@ -12,8 +13,8 @@ class Parser: log.msg("The parse function of the empty parser was used.", level=log.WARNING) pass - def generate_search_url(self, compound): - # return website[:-1] + compound + def new_compound_request(self, compound): + # return Request(url=self.website[:-1] + compound, callable=self.parse) pass def set_spider(self, spider): diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 9b356f8..edd74a9 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -5,7 +5,7 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" - parsers = [] + __parsers = [] def __init__(self, compound=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) @@ -18,10 +18,17 @@ class FourmiSpider(Spider): return parser.parse(reponse) return None + def get_synonym_requests(self, compound): + requests = [] + for parser in self.parsers: + requests.append(parser.new_compound_request(compound)) + return requests + + def add_parsers(self, parsers): for parser in parsers: self.add_parser(parser) def add_parser(self, parser): - self.parsers.add(parser) + self.__parsers.add(parser) parser.set_spider(self) \ No newline at end of file From 683f8c09d44888eb165af1e9a738067a8ff621ea Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Tue, 1 Apr 2014 21:12:54 +0200 Subject: [PATCH 5/8] Quick fix, python errors --- Fourmi.py | 2 +- FourmiCrawler/parsers/parser.py | 2 +- FourmiCrawler/spider.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index c411b4a..a71400c 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -18,7 +18,7 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] # [fix] - This also finds classes that are imported. for cls in classes: parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? return parsers diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 68f73cf..a3710c5 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -1,5 +1,5 @@ from scrapy import log -from scrapy.http import Request +# from scrapy.http import Request class Parser: diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index edd74a9..3fc5ce0 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -30,5 +30,5 @@ class FourmiSpider(Spider): self.add_parser(parser) def add_parser(self, parser): - self.__parsers.add(parser) + self.__parsers.append(parser) parser.set_spider(self) \ No newline at end of file From 0bf2d102c6ca7c6db2fb035a3774fe032155fa8e Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:21:30 +0200 Subject: [PATCH 6/8] Fixed parser importation, so it doesn't import imported classes. 
--- Fourmi.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index a71400c..2bed5cc 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,7 +9,7 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings -import os, inspect +import os, inspect, re def load_parsers(rel_dir="FourmiCrawler/parsers"): path = os.path.dirname(os.path.abspath(__file__)) @@ -18,9 +18,10 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] # [fix] - This also finds classes that are imported. + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: - parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? + if re.match(path + "/*", inspect.getfile(cls)): + parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? return parsers def setup_crawler(searchable): From cd421cc2fbf02e702b1a7fcf3db03c94cac77d30 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:24:04 +0200 Subject: [PATCH 7/8] Replaced literal for testing with a variable fix. 
--- Fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Fourmi.py b/Fourmi.py index 2bed5cc..015ae13 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -17,7 +17,7 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): parsers = [] for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: - mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable + mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if re.match(path + "/*", inspect.getfile(cls)): From 7bc160f67623dc382003680221fbd74d256441aa Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:38:11 +0200 Subject: [PATCH 8/8] The spider is now able to start using the synonym generator --- Fourmi.py | 6 +++--- FourmiCrawler/parsers/parser.py | 2 +- FourmiCrawler/spider.py | 17 +++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 015ae13..1a3e11b 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -24,8 +24,8 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"): parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? 
return parsers -def setup_crawler(searchable): - spider = FourmiSpider(compound=searchable) +def setup_crawler(searchables): + spider = FourmiSpider(compounds=searchables) spider.add_parsers(load_parsers()) settings = get_project_settings() crawler = Crawler(settings) @@ -36,7 +36,7 @@ def setup_crawler(searchable): def start(): - setup_crawler("Methane") + setup_crawler(["Methane"]) log.start() reactor.run() diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index a3710c5..8499fea 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -14,7 +14,7 @@ class Parser: pass def new_compound_request(self, compound): - # return Request(url=self.website[:-1] + compound, callable=self.parse) + # return Request(url=self.website[:-1] + compound, callback=self.parse) pass def set_spider(self, spider): diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 3fc5ce0..4d6b897 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -6,13 +6,17 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" __parsers = [] + synonyms = [] - def __init__(self, compound=None, *args, **kwargs): + def __init__(self, compounds=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) - self.synonyms = [compound] + if isinstance(compounds, list): + self.synonyms.extend(compounds) + else: + self.synonyms.append(compounds) def parse(self, reponse): - for parser in self.parsers: + for parser in self.__parsers: if re.match(parser.website, reponse.url): log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) return parser.parse(reponse) @@ -20,10 +24,15 @@ class FourmiSpider(Spider): def get_synonym_requests(self, compound): requests = [] - for parser in self.parsers: + for parser in self.__parsers: requests.append(parser.new_compound_request(compound)) return requests + def start_requests(self): + requests = [] + for synonym in self.synonyms: + 
requests.extend(self.get_synonym_requests(synonym)) + return requests def add_parsers(self, parsers): for parser in parsers: