From c27a875d681d0f912570bef4a583b85ea483bdbe Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:18:03 +0200
Subject: [PATCH] Parser/Source consistency

---
 FourmiCrawler/spider.py | 32 ++++++++++++++++----------------
 fourmi.py               |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index a58b6ea..08abb6b 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,7 +9,7 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    __parsers = []
+    __sources = []
     synonyms = []
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
@@ -25,14 +25,14 @@ class FourmiSpider(Spider):
     def parse(self, response):
         """
         The function that is called when a response to a request is available. This function distributes this to a
-        parser which should be able to handle parsing the data.
+        source which should be able to handle parsing the data.
         :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
-        for parser in self.__parsers:
-            if re.match(parser.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(response)
+        for source in self.__sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
         return None
 
     def get_synonym_requests(self, compound):
@@ -42,7 +42,7 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self.__parsers:
+        for parser in self.__sources:
             parser_requests = parser.new_compound_request(compound)
             if parser_requests is not None:
                 requests.append(parser_requests)
@@ -58,18 +58,18 @@ class FourmiSpider(Spider):
             requests.extend(self.get_synonym_requests(synonym))
         return requests
 
-    def add_parsers(self, parsers):
+    def add_sources(self, sources):
         """
-        A function to add a new Parser objects to the list of available parsers.
-        :param parsers: A list of Parser Objects.
+        A function to add new Source objects to the list of available sources.
+        :param sources: A list of Source Objects.
         """
-        for parser in parsers:
-            self.add_parser(parser)
+        for source in sources:
+            self.add_source(source)
 
-    def add_parser(self, parser):
+    def add_source(self, source):
         """
         A function add a new Parser object to the list of available parsers.
-        :param parser: A Parser Object
+        :param source: A Source Object
         """
-        self.__parsers.append(parser)
-        parser.set_spider(self)
\ No newline at end of file
+        self.__sources.append(source)
+        source.set_spider(self)
\ No newline at end of file
diff --git a/fourmi.py b/fourmi.py
index 9f32cff..945c8a2 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -42,7 +42,7 @@ def setup_crawler(compound, settings, source_loader, attributes):
     :param attributes: A list of regular expressions which the attribute names should match.
     """
     spider = FourmiSpider(compound=compound, selected_attributes=attributes)
-    spider.add_parsers(source_loader.sources)
+    spider.add_sources(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
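
Note for reviewers: after this rename, anything handed to the spider only needs to
look like a source. A minimal sketch of the interface the spider expects, inferred
from the calls made in this patch (website, parse, new_compound_request, set_spider);
the DummySource class and its URL scheme are hypothetical, for illustration only:

    from scrapy.http import Request

    class DummySource(object):
        # regex matched against response.url in FourmiSpider.parse
        website = "http://example\\.com/.*"

        def set_spider(self, spider):
            # called back by FourmiSpider.add_source
            self.spider = spider

        def new_compound_request(self, compound):
            # hypothetical URL layout; returns the initial Request for a compound
            return Request(url="http://example.com/" + compound, callback=self.parse)

        def parse(self, response):
            # would return Result items and/or follow-up Requests
            return []

    spider = FourmiSpider(compound="methane")
    spider.add_sources([DummySource()])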