Merge branch 'release/v0.2.0'
commit 23189499e6

FourmiCrawler/parsers/ChemSpider.py (new file, 218 lines)
@@ -0,0 +1,218 @@
from parser import Parser
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
from FourmiCrawler.items import Result
import re

# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.


class ChemSpider(Parser):
    """ChemSpider scraper for synonyms and properties

    This parser will manage searching for chemicals through the
    ChemSpider API, and parsing the resulting ChemSpider page.
    The token required for the API should be in a configuration file
    somewhere.
    """

    def __init__(self):
        pass

    website = 'http://www.chemspider.com/*'

    # [TODO] - Save and access token of specific user.
    search = ('Search.asmx/SimpleSearch?query=%s&token='
              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
    structure = 'Chemical-Structure.%s.html'
    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')

    ignore_list = []

    def parse(self, response):
        sel = Selector(response)
        requests = []
        requests_synonyms = self.parse_synonyms(sel)
        requests.extend(requests_synonyms)
        requests_properties = self.parse_properties(sel)
        requests.extend(requests_properties)

        return requests

    @staticmethod
    def parse_properties(sel):
        """Scrape Experimental Data and Predicted ACD/Labs tabs"""
        properties = []

        # Predicted - ACD/Labs tab
        # [TODO] - test if tab contains data, some chemicals do not have data here
        td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
            'normalize-space(string())')
        prop_names = td_list[::2]
        prop_values = td_list[1::2]
        for (prop_name, prop_value) in zip(prop_names, prop_values):
            # [:-1] is to remove the colon at the end, [TODO] - test for colon
            prop_name = prop_name.extract().encode('utf-8')[:-1]
            prop_value = prop_value.extract().encode('utf-8')
            prop_conditions = ''

            # Match for a condition in parentheses,
            # e.g. 'Boiling Point (760 mmHg)' -> ('Boiling Point', '760 mmHg')
            m = re.match(r'(.*) \((.*)\)', prop_name)
            if m:
                prop_name = m.group(1)
                prop_conditions = m.group(2)

            # Match for a condition in the value, separated by an 'at',
            # e.g. '0.7918 g/cm3 at 20 C' -> ('0.7918 g/cm3', '20 C')
            m = re.match(r'(.*) at (.*)', prop_value)
            if m:
                prop_value = m.group(1)
                prop_conditions = m.group(2)

            new_prop = Result({
                'attribute': prop_name,
                'value': prop_value,
                'source': 'ChemSpider Predicted - ACD/Labs Tab',
                'reliability': 'Unknown',
                'conditions': prop_conditions
            })
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
                    level=log.DEBUG)

        # Experimental Data Tab, Physico-chemical properties in particular
        scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                 'Properties"]//li/table/tr/td')
        if not scraped_list:
            return properties
        # Format is: property name followed by a list of values
        property_name = scraped_list.pop(0).xpath(
            'span/text()').extract()[0].rstrip()
        for line in scraped_list:
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
                new_prop = Result({
                    'attribute': property_name[:-1],
                    'value': line.xpath('text()').extract()[0].rstrip(),
                    'source': line.xpath(
                        'strong/text()').extract()[0].rstrip(),
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
                         new_prop['source']), level=log.DEBUG)

        return properties

    def parse_synonyms(self, sel):
        """Scrape list of Names and Identifiers"""
        requests = []
        synonyms = []

        # Exact type for this is unknown, but equivalent to Validated by Expert
        for syn in sel.xpath('//p[@class="syn"][span[@class="synonym_cn"]]'):
            name = syn.xpath('span[@class="synonym_cn"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Experts"
        for syn in sel.xpath('//p[@class="syn"][strong]'):
            name = syn.xpath('strong/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'expert'))
        # These synonyms are labeled by ChemSpider as "Validated by Users"
        for syn in sel.xpath(
                '//p[@class="syn"][span[@class="synonym_confirmed"]]'):
            name = syn.xpath(
                'span[@class="synonym_confirmed"]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'user'))
        # These synonyms are labeled as "Non-validated" and assumed unreliable
        for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'):
            name = syn.xpath('span[@class=""]/text()').extract()[0]
            synonyms.append(self.new_synonym(syn, name, 'nonvalidated'))

        # [TODO] - confirm if English User-Validated synonyms are OK too
        for syn in synonyms:
            if syn['category'] == 'expert' and syn['language'] == 'English':
                log.msg('CS emit synonym: %s' % syn['name'], level=log.DEBUG)
                # Collect the spider's follow-up requests; without this the
                # list returned below would always be empty (assumes
                # get_synonym_requests returns an iterable of Requests).
                requests.extend(
                    self._spider.get_synonym_requests(syn['name']))

        return requests

    def new_synonym(self, sel, name, category):
        """Scrape for a single synonym at a given HTML tag"""
        self.ignore_list.append(name)
        language = sel.xpath('span[@class="synonym_language"]/text()')
        if language:
            # The [1:-1] is to remove the brackets around the language name
            language = language.extract()[0][1:-1]
        else:
            # If no language is given, English is assumed, [TODO] - confirm
            language = 'English'
        log.msg('CS synonym: %s (%s) (%s)' % (name, category, language),
                level=log.DEBUG)
        references = []
        # A synonym can have multiple references, each optionally with a link
        for ref in sel.xpath('span[@class="synonym_ref"]'):
            refname = ref.xpath('normalize-space(string())')
            references.append({
                'name': refname.extract()[0][1:-1],
                'URI': ''
            })
        for ref in sel.xpath('a[@class="synonym_ref"]'):
            references.append({
                'name': ref.xpath('@title').extract()[0],
                'URI': ref.xpath('@href').extract()[0]
            })
        for ref in references:
            log.msg('CS synonym ref: %s %s' % (ref['name'], ref['URI']),
                    level=log.DEBUG)
        synonym = {
            'name': name,
            'category': category,
            'language': language,
            'references': references
        }
        return synonym

    @staticmethod
    def parse_extendedinfo(response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
            result = Result({
                'attribute': name,
                'value': value,  # These values have no unit!
                'source': 'ChemSpider ExtendedCompoundInfo',
                'reliability': 'Unknown',
                'conditions': ''
            })
            properties.append(result)
        return properties

    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API"""
        sel = Selector(response)
        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
        sel.register_namespace('cs', 'http://www.chemspider.com/')
        csid = sel.xpath('.//cs:int/text()').extract()[0]
        # [TODO] - handle multiple csids in case of vague search term
        structure_url = self.website[:-1] + self.structure % csid
        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
        return [Request(url=structure_url,
                        callback=self.parse),
                Request(url=extendedinfo_url,
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
        if compound in self.ignore_list:  # [TODO] - add regular expression
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)
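
For context, a minimal sketch of the crawl flow this file implements. It is illustrative only: 'methanol' is an arbitrary example compound, and wiring the parser to a live spider is assumed, not shown in this commit.

# Illustrative driver code; only ChemSpider's own methods are from this commit.
from FourmiCrawler.parsers.ChemSpider import ChemSpider

parser = ChemSpider()
request = parser.new_compound_request('methanol')  # Request against the SimpleSearch API
# When Scrapy downloads that URL, parse_searchrequest reads the csid and
# returns two follow-up Requests: the structure page (callback=parse) and
# the GetExtendedCompoundInfo endpoint (callback=parse_extendedinfo), which
# in turn produce Result items and synonym requests.
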
FourmiCrawler/parsers/parser.py

@@ -3,11 +3,11 @@ from scrapy import log
 class Parser:
     '''
     website should be a regular expression matching the URLs the parser is able to parse.
     '''
-    website = "http://something/*"
-    __spider = None
+    website = "http://something/*"  # Regex of URI's the source is able to parse
+    _spider = None

     def __init__(self):
         pass

     def parse(self, response):
         log.msg("The parse function of the empty parser was used.", level=log.WARNING)
@@ -18,4 +18,4 @@ class Parser:
         pass

     def set_spider(self, spider):
-        self.__spider = spider
+        self._spider = spider
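
The switch from __spider to _spider is not cosmetic: a double-underscore attribute is name-mangled per class, so a subclass like ChemSpider (which calls self._spider in parse_synonyms above) could never have reached Parser's __spider. A minimal, runnable illustration with hypothetical names Base and Sub:

class Base:
    __hidden = None   # name-mangled: stored on Base as _Base__hidden
    _shared = None    # single underscore: protected by convention only

class Sub(Base):
    def probe(self):
        print(self._shared)        # works: normal lookup finds Base's attribute
        try:
            print(self.__hidden)   # mangled to _Sub__hidden -> AttributeError
        except AttributeError as error:
            print('lookup failed:', error)

Sub().probe()
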
@@ -86,7 +86,7 @@ def start_log(arguments):


 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.1.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.0')
     start_log(arguments)
     settings = scrapy_settings_manipulation(arguments)
     setup_crawler(arguments["<compound>"], settings)
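
The bumped string is user-visible because docopt implements the --version flag from it. A standalone sketch of that behavior; demo.py and its usage string are hypothetical, not Fourmi's actual interface:

"""Usage: demo.py [--version]"""
import docopt

arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.0')
# Invoked as 'python demo.py --version', docopt prints 'Fourmi - V0.2.0'
# and exits before the line below runs.
print(arguments)
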