
Parser/Source consistency

Jip J. Dekker 2014-06-01 20:18:03 +02:00
parent 3499946e97
commit c27a875d68
2 changed files with 17 additions and 17 deletions


@@ -9,7 +9,7 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    __parsers = []
+    __sources = []
     synonyms = []
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
@@ -25,14 +25,14 @@ class FourmiSpider(Spider):
     def parse(self, response):
         """
         The function that is called when a response to a request is available. This function distributes this to a
-        parser which should be able to handle parsing the data.
+        source which should be able to handle parsing the data.
         :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
-        for parser in self.__parsers:
-            if re.match(parser.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(response)
+        for source in self.__sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
         return None
 
     def get_synonym_requests(self, compound):
@@ -42,7 +42,7 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self.__parsers:
+        for parser in self.__sources:
             parser_requests = parser.new_compound_request(compound)
             if parser_requests is not None:
                 requests.append(parser_requests)
@@ -58,18 +58,18 @@ class FourmiSpider(Spider):
             requests.extend(self.get_synonym_requests(synonym))
         return requests
 
-    def add_parsers(self, parsers):
+    def add_sources(self, sources):
         """
-        A function to add a new Parser objects to the list of available parsers.
-        :param parsers: A list of Parser Objects.
+        A function to add a new Parser objects to the list of available sources.
+        :param sources: A list of Source Objects.
         """
-        for parser in parsers:
-            self.add_parser(parser)
+        for parser in sources:
+            self.add_source(parser)
 
-    def add_parser(self, parser):
+    def add_source(self, source):
         """
         A function add a new Parser object to the list of available parsers.
-        :param parser: A Parser Object
+        :param source: A Source Object
         """
-        self.__parsers.append(parser)
-        parser.set_spider(self)
+        self.__sources.append(source)
+        source.set_spider(self)
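
For orientation, here is a minimal sketch (not part of this commit) of how the renamed spider API fits together. The Source interface is inferred from the calls visible in the diff (website, parse, new_compound_request, set_spider); ExampleSource, its URL pattern, and the compound name are illustrative assumptions.

class ExampleSource(object):
    # Pattern that FourmiSpider.parse matches against response.url before
    # handing the response to this source.
    website = "http://example\\.com/.*"

    def set_spider(self, spider):
        # Called by FourmiSpider.add_source so the source can reach the spider.
        self.spider = spider

    def new_compound_request(self, compound):
        # A real source would return a Scrapy Request for the compound here.
        return None

    def parse(self, response):
        # A real source would return scraped Result items here.
        return []

# Assumed usage, mirroring the renamed methods above:
# spider = FourmiSpider(compound="methane", selected_attributes=[".*"])
# spider.add_sources([ExampleSource()])
# Scrapy then passes each downloaded response to spider.parse, which forwards
# it to the source whose website pattern matches response.url.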


@@ -42,7 +42,7 @@ def setup_crawler(compound, settings, source_loader, attributes):
     :param attributes: A list of regular expressions which the attribute names should match.
     """
     spider = FourmiSpider(compound=compound, selected_attributes=attributes)
-    spider.add_parsers(source_loader.sources)
+    spider.add_sources(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
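
As a hedged usage sketch (an assumption, not shown in this commit), the updated call site can be driven with any loader object exposing a sources list; StubLoader and get_project_settings are illustrative stand-ins for the project's own source loader and settings.

class StubLoader(object):
    # Anything with a `sources` attribute satisfies spider.add_sources above;
    # this reuses the illustrative ExampleSource sketched earlier.
    sources = [ExampleSource()]

# from scrapy.utils.project import get_project_settings
# setup_crawler("methane", get_project_settings(), StubLoader(), [".*"])
# reactor.run()  # setup_crawler connected reactor.stop to spider_closed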