From 5e13af5b1b68ba6b2ee58b81114a1756260a7f01 Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 13:42:49 +0200 Subject: [PATCH] added function comments to ChemSpider --- FourmiCrawler/sources/ChemSpider.py | 65 +++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 87a6ee7..882c0b6 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -12,8 +12,8 @@ from FourmiCrawler.items import Result # [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not class ChemSpider(Source): - """ChemSpider scraper for synonyms and properties - + """ + ChemSpider scraper for synonyms and properties This parser will manage searching for chemicals through the ChemsSpider API, and parsing the resulting ChemSpider page. The token required for the API should be in a configuration file @@ -27,6 +27,11 @@ class ChemSpider(Source): extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' def __init__(self, config={}): + """ + Initialization of ChemSpider scraper + :param config: a dictionary of settings for this scraper, must contain + 'reliability' key + """ Source.__init__(self, config) self.cfg = config self.ignore_list = [] @@ -37,8 +42,13 @@ class ChemSpider(Source): self.search += self.cfg['token'] self.extendedinfo += self.cfg['token'] - def parse(self, response): + """ + This function is called when a Response matching the variable + 'website' is available for parsing the Response object. + :param response: the Scrapy Response object to be parsed + :return: a list of Result items and Request objects + """ sel = Selector(response) requests = [] requests_synonyms = self.parse_synonyms(sel) @@ -49,7 +59,11 @@ class ChemSpider(Source): return requests def parse_properties(self, sel): - """scrape Experimental Data and Predicted ACD/Labs tabs""" + """ + This function scrapes the Experimental Data and Predicted ACD/Labs tabs + :param sel: a Selector object of the whole page + :return: a list of Result items + """ properties = [] # Predicted - ACD/Labs tab @@ -115,7 +129,11 @@ class ChemSpider(Source): return properties def parse_synonyms(self, sel): - """Scrape list of Names and Identifiers""" + """ + This function scrapes the list of Names and Identifiers + :param sel: a Selector object of the whole page + :return: a list of Requests + """ requests = [] synonyms = [] @@ -147,7 +165,13 @@ class ChemSpider(Source): return requests def new_synonym(self, sel, name, category): - """Scrape for a single synonym at a given HTML tag""" + """ + This function scrapes for a single synonym at a given HTML tag + :param sel: a Selector object of the given HTML tag + :param name: the name of the synonym in the tag + :param category: the name of the category the synonym is labeled as + :return: a dictionary containing data on the synonym + """ self.ignore_list.append(name) language = sel.xpath('span[@class="synonym_language"]/text()') if language: @@ -183,7 +207,12 @@ class ChemSpider(Source): return synonym def parse_extendedinfo(self, response): - """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" + """ + This function scrapes data from the ChemSpider GetExtendedCompoundInfo + API, if a token is present in the configuration settings + :param response: a Response object to be parsed + :return: a list of Result items + """ sel = Selector(response) properties = [] names = sel.xpath('*').xpath('name()').extract() @@ -199,6 +228,15 @@ class ChemSpider(Source): return properties def newresult(self, attribute, value, conditions='', source='ChemSpider'): + """ + This function abstracts from the Result item and provides default + values. + :param attribute: the name of the attribute + :param value: the value of the attribute + :param conditions: optional conditions regarding the value + :param source: the name of the source if it is not ChemSpider + :return: A Result item + """ return Result({ 'attribute': attribute, 'value': value, @@ -208,7 +246,13 @@ class ChemSpider(Source): }) def parse_searchrequest(self, response): - """Parse the initial response of the ChemSpider Search API """ + """ + This function parses the initial response of the ChemSpider Search API + Requires a valid token to function. + :param response: the Response object to be parsed + :return: A Request for the information page and a Request for the + extendedinfo API call + """ sel = Selector(response) log.msg('chemspider parse_searchrequest', level=log.DEBUG) sel.register_namespace('cs', 'http://www.chemspider.com/') @@ -229,6 +273,11 @@ class ChemSpider(Source): callback=self.parse_extendedinfo)] def new_compound_request(self, compound): + """ + This function is called when a new synonym is returned to the spider + to generate new requests + :param compound: the name of the compound to search for + """ if compound in self.ignore_list or self.cfg['token'] == '': return None searchurl = self.website[:-1] + self.search % compound