Added the functionality to add parsers and automatically use them.

2014-03-30 23:37:42 +02:00 · 2014-03-30 23:37:42 +02:00 · 0cc1b23353
commit 0cc1b23353
parent 6e2df64fe4
3 changed files with 33 additions and 23 deletions
--- a/Fourmi.py
+++ b/Fourmi.py
@ -9,23 +9,25 @@ from scrapy.crawler import Crawler
 from scrapy import log, signals
 from FourmiCrawler.spider import FourmiSpider
 from scrapy.utils.project import get_project_settings
+from FourmiCrawler.parsers.parser import Parser


 def setup_crawler(searchable):
-    # [TODO] - Initiate all parsers for the different websites and get
-    # allowed URLs.
-    spider = FourmiSpider(compound=searchable)
-    settings = get_project_settings()
-    crawler = Crawler(settings)
-    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-    crawler.configure()
-    crawler.crawl(spider)
-    crawler.start()
+	# [TODO] - Initiate all parsers for the different websites and get allowed URLs.
+	spider = FourmiSpider(compound=searchable)
+	spider.add_parser(Parser())
+	settings = get_project_settings()
+	crawler = Crawler(settings)
+	crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+	crawler.configure()
+	crawler.crawl(spider)
+	crawler.start()


 def start():
-    setup_crawler("Methane")
-    log.start()
-    reactor.run()
+	setup_crawler("Methane")
+	log.start()
+	reactor.run()
+

 start()
--- a/FourmiCrawler/parsers/parser.py
+++ b/FourmiCrawler/parsers/parser.py
@ -2,8 +2,11 @@ from scrapy import log


 class Parser:
-    website = "http://localhost/*"
+	'''
+	website should be a regular expression matching the URLs of the websites you want to parse.
+	'''
+	website = "http://localhost/*"

-    def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.Warning)
-        pass
+	def parse(self, reponse):
+		log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+		pass
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@ -1,19 +1,24 @@
 from scrapy.spider import Spider
+from scrapy import log
+import re


 class FourmiSpider(Spider):
 	name = "FourmiSpider"
+	start_urls = ["http://localhost/"]
+	parsers = []

 	def __init__(self, compound=None, *args, **kwargs):
 		super(FourmiSpider, self).__init__(*args, **kwargs)
 		self.synonyms = [compound]

-
-def parse(self, reponse):
-	# [TODO] - This function should delegate it's functionality to other
-	# parsers.
-	pass
+	def parse(self, reponse):  # Delegate the response to the first registered parser whose `website` regex matches its URL.
+		for parser in self.parsers:
+			if re.match(parser.website, reponse.url):  # re.match anchors at the start of the URL only.
+				log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
+				return parser.parse(reponse)
+		return None  # No registered parser matched this URL.


-def add_parser(self, parser):
-	self.parsers.add(parser)
+	def add_parser(self, parser):
+		self.parsers.append(parser)