From 0cc1b233533a4667175c0f1b2eab9aefc9cca4f2 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 30 Mar 2014 23:37:42 +0200 Subject: [PATCH] Added the functionality to add parsers and automatically use them. --- Fourmi.py | 26 ++++++++++++++------------ FourmiCrawler/parsers/parser.py | 11 +++++++---- FourmiCrawler/spider.py | 19 ++++++++++++------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 3d54c71..094a5d7 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,23 +9,25 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings +from FourmiCrawler.parsers.parser import Parser def setup_crawler(searchable): - # [TODO] - Initiate all parsers for the different websites and get - # allowed URLs. - spider = FourmiSpider(compound=searchable) - settings = get_project_settings() - crawler = Crawler(settings) - crawler.signals.connect(reactor.stop, signal=signals.spider_closed) - crawler.configure() - crawler.crawl(spider) - crawler.start() + # [TODO] - Initiate all parsers for the different websites and get allowed URLs. + spider = FourmiSpider(compound=searchable) + spider.add_parser(Parser()) + settings = get_project_settings() + crawler = Crawler(settings) + crawler.signals.connect(reactor.stop, signal=signals.spider_closed) + crawler.configure() + crawler.crawl(spider) + crawler.start() def start(): - setup_crawler("Methane") - log.start() - reactor.run() + setup_crawler("Methane") + log.start() + reactor.run() + start() diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 3362d59..7097ee3 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -2,8 +2,11 @@ from scrapy import log class Parser: - website = "http://localhost/*" + ''' + website should be a regular expression of websites you want to parse. 
+ ''' + website = "http://localhost/*" - def parse(self, reponse): - log.msg("The parse function of the empty parser was used.", level=log.Warning) - pass + def parse(self, reponse): + log.msg("The parse function of the empty parser was used.", level=log.WARNING) + pass \ No newline at end of file diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index a08d997..40d6dfc 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -1,19 +1,24 @@ from scrapy.spider import Spider +from scrapy import log +import re class FourmiSpider(Spider): name = "FourmiSpider" + start_urls = ["http://localhost/"] + parsers = [] def __init__(self, compound=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms = [compound] - -def parse(self, reponse): - # [TODO] - This function should delegate it's functionality to other - # parsers. - pass + def parse(self, reponse): + for parser in self.parsers: + if re.match(parser.website, reponse.url): + log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG) + return parser.parse(reponse) + return None - def add_parser(self, parser): - self.parsers.add(parser) + def add_parser(self, parser): + self.parsers.append(parser)