From f93dc2d1602b00c735236a02d7a7611be57657be Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 1 Apr 2014 21:07:36 +0200 Subject: [PATCH] Added a structure to get requests for all websites for a new synonym --- FourmiCrawler/parsers/parser.py | 7 ++++--- FourmiCrawler/spider.py | 11 +++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py index 78d9dc1..68f73cf 100644 --- a/FourmiCrawler/parsers/parser.py +++ b/FourmiCrawler/parsers/parser.py @@ -1,9 +1,10 @@ from scrapy import log +from scrapy.http import Request class Parser: ''' - website should be an regular expression of websites you want to parse. + website should be a regular expression of the URLs of requests the parser is able to parse. ''' website = "http://something/*" __spider = None @@ -12,8 +13,8 @@ class Parser: log.msg("The parse function of the empty parser was used.", level=log.WARNING) pass - def generate_search_url(self, compound): - # return website[:-1] + compound + def new_compound_request(self, compound): + # return Request(url=self.website[:-1] + compound, callback=self.parse) pass def set_spider(self, spider): diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 9b356f8..edd74a9 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -5,7 +5,7 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" - parsers = [] + __parsers = [] def __init__(self, compound=None, *args, **kwargs): super(FourmiSpider, self).__init__(*args, **kwargs) @@ -18,10 +18,17 @@ class FourmiSpider(Spider): return parser.parse(reponse) return None + def get_synonym_requests(self, compound): + requests = [] + for parser in self.parsers: + requests.append(parser.new_compound_request(compound)) + return requests + + def add_parsers(self, parsers): for parser in parsers: self.add_parser(parser) def add_parser(self, parser): - self.parsers.add(parser) + self.__parsers.add(parser) 
parser.set_spider(self) \ No newline at end of file