Merge branch 'feature/sources-configuration' into develop

2014-06-08 22:36:02 +02:00 · 2014-06-08 22:36:02 +02:00 · 1ab7d0ba76
commit 1ab7d0ba76
parent efb7d60079 326413effa
10 changed files with 187 additions and 119 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,6 +4,9 @@
 #Python Specific ignores
 *.pyc
 #may contain authentication information
 sources.cfg
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@ -9,7 +9,7 @@ from FourmiCrawler.items import Result
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
-
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
 class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties
@ -20,19 +20,23 @@ class ChemSpider(Source):
    somewhere.
    """
    def __init__(self):
        Source.__init__(self)
    website = 'http://www.chemspider.com/*'
-    # [TODO] - Save and access token of specific user.
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
    search = ('Search.asmx/SimpleSearch?query=%s&token='
              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
    structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+
    def __init__(self, config={}):
        """
        Initiation of the ChemSpider source.
        The configured token is appended to the search and extended info
        API urls of this instance.
        :param config: a dictionary of settings for this source; the 'token'
                       key is read here, a missing or empty token is logged
                       as a warning and stored as an empty string
        """
        Source.__init__(self, config)
        # Work on a private copy: the token fallback below writes into the
        # dictionary, which must not leak into the shared default argument or
        # into a dictionary the caller still uses for other sources.
        self.cfg = dict(config)
        self.ignore_list = []
        if self.cfg.get('token', '') == '':
            log.msg('ChemSpider token not set or empty, search/MassSpec API '
                    'not available', level=log.WARNING)
            # Store the empty token so self.cfg['token'] can always be read.
            self.cfg['token'] = ''
        # '+=' on the class level url templates creates instance attributes,
        # the class attributes themselves stay without a token.
        self.search += self.cfg['token']
        self.extendedinfo += self.cfg['token']
    ignore_list = []
    def parse(self, response):
        sel = Selector(response)
@ -44,8 +48,7 @@ class ChemSpider(Source):
        return requests
-    @staticmethod
+    def parse_properties(self, sel):
    def parse_properties(sel):
        """scrape Experimental Data and Predicted ACD/Labs tabs"""
        properties = []
@ -76,13 +79,12 @@ class ChemSpider(Source):
                prop_value = m.group(1)
                prop_conditions = m.group(2)
-            new_prop = Result({
+            new_prop = self.newresult(
-                'attribute': prop_name,
+                attribute=prop_name,
-                'value': prop_value,
+                value=prop_value,
-                'source': 'ChemSpider Predicted - ACD/Labs Tab',
+                source='ChemSpider Predicted - ACD/Labs Tab',
-                'reliability': 'Unknown',
+                conditions=prop_conditions
-                'conditions': prop_conditions
+            )
            })
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
@ -100,14 +102,11 @@ class ChemSpider(Source):
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
-                new_prop = Result({
+                new_prop = self.newresult(
-                    'attribute': property_name[:-1],
+                    attribute=property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
+                    value=line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
-                        'strong/text()').extract()[0].rstrip(),
+                )
                    'reliability': 'Unknown',
                    'conditions': ''
                })
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
@ -183,25 +182,31 @@ class ChemSpider(Source):
        }
        return synonym
-    @staticmethod
+    def parse_extendedinfo(self, response):
    def parse_extendedinfo(response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
-            result = Result({
+            result = self.newresult(
-                'attribute': name,
+                attribute=name,
-                'value': value,  # These values have no unit!
+                value=value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
+                source='ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
+            )
                'conditions': ''
            })
            if result['value']:
                properties.append(result)
        return properties
    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
        """
        Build a Result item for a scraped property.
        :param attribute: the name of the property
        :param value: the value of the property
        :param conditions: the conditions under which the value holds
        :param source: the label stored as the source of the Result
        :return a Result whose reliability is taken from the configuration
                of this source
        """
        return Result({
            'attribute': attribute,
            'value': value,
            'source': source,
            # Fall back to an empty reliability so that a source created
            # without a 'reliability' setting does not raise a KeyError.
            'reliability': self.cfg.get('reliability', ''),
            'conditions': conditions
        })
    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API """
        sel = Selector(response)
@ -224,7 +229,7 @@ class ChemSpider(Source):
                        callback=self.parse_extendedinfo)]
    def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@ -22,10 +22,12 @@ class NIST(Source):
    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
-    ignore_list = set()
+    cfg = {}
-    def __init__(self):
+    def __init__(self, config={}):
-        Source.__init__(self)
+        Source.__init__(self, config)
        self.ignore_list = set()
        self.cfg = config
    def parse(self, response):
        sel = Selector(response)
@ -114,13 +116,10 @@ class NIST(Source):
        requests = []
        for key, value in data.iteritems():
-            result = Result({
+            result = self.newresult(
-                'attribute': key,
+                attribute=key,
-                'value': value,
+                value=value
-                'source': 'NIST',
+            )
                'reliability': 'Unknown',
                'conditions': ''
            })
            requests.append(result)
        return requests
@ -150,19 +149,16 @@ class NIST(Source):
                name = m.group(1)
                condition = m.group(2)
-            result = Result({
+            result = self.newresult(
-                'attribute': name,
+                attribute=name,
-                'value': data[1] + ' ' + data[2],
+                value=data[1] + ' ' + data[2],
-                'source': 'NIST',
+                conditions=condition
-                'reliability': 'Unknown',
+            )
                'conditions': condition
            })
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results
-    @staticmethod
+    def parse_transition_data(self, table, summary):
    def parse_transition_data(table, summary):
        """Parses the table containing properties regarding phase changes"""
        results = []
@ -174,19 +170,16 @@ class NIST(Source):
        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
+            result = self.newresult(
-                'attribute': summary,
+                attribute=summary,
-                'value': tds[0] + ' ' + unit,
+                value=tds[0] + ' ' + unit,
-                'source': 'NIST',
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-                'reliability': 'Unknown',
+            )
                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
            })
            results.append(result)
        return results
-    @staticmethod
+    def parse_generic_data(self, table, summary):
    def parse_generic_data(table, summary):
        """Parses the common tables of 4 and 5 rows. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
@ -202,36 +195,30 @@ class NIST(Source):
        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
+            result = self.newresult(
-                'attribute': summary,
+                attribute=summary,
-                'value': tds[0] + ' ' + unit,
+                value=tds[0] + ' ' + unit,
-                'source': 'NIST',
+                conditions='%s K' % tds[1]
-                'reliability': 'Unknown',
+            )
                'conditions': '%s K' % tds[1]
            })
            results.append(result)
        return results
-    @staticmethod
+    def parse_antoine_data(self, table, summary):
    def parse_antoine_data(table, summary):
        """Parse table containing parameters for the Antione equation"""
        results = []
        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
+            result = self.newresult(
-                'attribute': summary,
+                attribute=summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
+                conditions='%s K' % tds[0]
-                'reliability': 'Unknown',
+            )
                'conditions': '%s K' % tds[0]
            })
            results.append(result)
        return results
-    @staticmethod
+    def parse_individual_datapoints(self, response):
    def parse_individual_datapoints(response):
        """Parses the page linked from aggregate data"""
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
@ -258,17 +245,24 @@ class NIST(Source):
            if m:
                uncertainty = '+- %s ' % m.group(1)
                # [TODO]: get the plusminus sign working in here
-            result = Result({
+            result = self.newresult(
-                'attribute': name,
+                attribute=name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
+                value='%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
+                conditions=condition
-                'reliability': 'Unknown',
+            )
                'conditions': condition
            })
            results.append(result)
        return results
    def newresult(self, attribute, value, conditions=''):
        """
        Build a Result item with 'NIST' as its source.
        :param attribute: the name of the property
        :param value: the value of the property
        :param conditions: the conditions under which the value holds
        :return a Result whose reliability is taken from the configuration
                of this source
        """
        return Result({
            'attribute': attribute,
            'value': value,
            'source': 'NIST',
            # Fall back to an empty reliability so that a source created
            # without a 'reliability' setting does not raise a KeyError.
            'reliability': self.cfg.get('reliability', ''),
            'conditions': conditions
        })
    def new_compound_request(self, compound):
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@ -19,8 +19,11 @@ class WikipediaParser(Source):
    __spider = None
    searched_compounds = []
-    def __init__(self):
+    cfg = {}
-        Source.__init__(self)
+
    def __init__(self, config={}):
        """
        Initiation of the Wikipedia source.
        :param config: a dictionary of settings for this source
        """
        Source.__init__(self, config)
        # Keep a private copy so this instance never shares state with the
        # mutable default argument or with a dictionary owned by the caller.
        self.cfg = dict(config)
    def parse(self, response):
        """ Distributes the above described behaviour """
@ -44,13 +47,10 @@ class WikipediaParser(Source):
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for i, prop_name in enumerate(prop_names):
-            item = Result({
+            item = self.newresult(
-                'attribute': prop_name.extract().encode('utf-8'),
+                attribute=prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
-                'source': "Wikipedia",
+            )
                'reliability': "Unknown",
                'conditions': ""
            })
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
@ -61,13 +61,10 @@ class WikipediaParser(Source):
            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                    'normalize-space(string())'):
-                item = Result({
+                item = self.newresult(
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
+                )
                    'reliability': "Unknown",
                    'conditions': ""
                })
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
@ -116,4 +113,13 @@ class WikipediaParser(Source):
        """ find external links, named 'Identifiers' to different sources. """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-        return links
+        return links
    def newresult(self, attribute, value):
        """
        Build a Result item with 'Wikipedia' as its source and without
        conditions.
        :param attribute: the name of the property
        :param value: the value of the property
        :return a Result whose reliability is taken from the configuration
                of this source
        """
        return Result({
            'attribute': attribute,
            'value': value,
            'source': 'Wikipedia',
            # Fall back to an empty reliability so that a source created
            # without a 'reliability' setting does not raise a KeyError.
            'reliability': self.cfg.get('reliability', ''),
            'conditions': ''
        })
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@ -6,7 +6,7 @@ class Source:
    website = "http://something/*"  # Regex of URI's the source is able to parse
    _spider = None
-    def __init__(self):
+    def __init__(self, config={}):
        """
        Initiation of a new Source
        """
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@ -9,8 +9,6 @@ class FourmiSpider(Spider):
    A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"
    _sources = []
    synonyms = set()
    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """
@ -18,6 +16,8 @@ class FourmiSpider(Spider):
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the attributes should match.
        """
        self._sources = []
        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.synonyms.add(compound)
        self.selected_attributes = selected_attributes
--- a/tests/test_configurator.py
+++ b/tests/test_configurator.py
@ -1,6 +1,7 @@
 import unittest
 from utils.configurator import Configurator
 import ConfigParser
 class TestConfigurator(unittest.TestCase):
@ -24,4 +25,26 @@ class TestConfigurator(unittest.TestCase):
    #     self.conf.start_log("test.log", True)
    #     self.conf.start_log("test.log", False)
    #     self.conf.start_log(None, True)
-    #     self.conf.start_log(None, False)
+    #     self.conf.start_log(None, False)
    def test_read_sourceconfiguration(self):
        """The source configuration is handed back as a ConfigParser object."""
        self.assertIsInstance(self.conf.read_sourceconfiguration(),
                              ConfigParser.ConfigParser)
    def test_get_section(self):
        """
        Checks get_section for an empty parser, for a parser with only a
        DEFAULT reliability and for a parser with a named section.
        """
        config = ConfigParser.ConfigParser()

        # Nothing configured at all: reliability is present but empty.
        section = self.conf.get_section(config, 'test')
        self.assertIn('reliability', section)
        self.assertEqual(section['reliability'], '')

        # Only the DEFAULT section is set: its reliability is used.
        config.set('DEFAULT', 'reliability', 'Low')
        section = self.conf.get_section(config, 'test')
        self.assertEqual(section['reliability'], 'Low')

        # A named section still sees the DEFAULT reliability next to its
        # own variables.
        config.add_section('test')
        config.set('test', 'var', 'Maybe')
        section = self.conf.get_section(config, 'test')
        self.assertEqual(section['reliability'], 'Low')
        self.assertEqual(section['var'], 'Maybe')
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@ -3,7 +3,7 @@ import unittest
 from scrapy.http import Request
 from FourmiCrawler import spider
-from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.NIST import NIST
 from FourmiCrawler.sources.source import Source
@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase):
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])
-        src2 = ChemSpider()
+        src2 = NIST()
        self.spi.add_source(src2)
        requests = self.spi.start_requests()
        self.assertGreater(len(requests), 0)
@ -57,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase):
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)
-        src2 = ChemSpider()
+        src2 = NIST()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
-        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
+        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
--- a/utils/configurator.py
+++ b/utils/configurator.py
@ -1,6 +1,6 @@
 from scrapy import log
 from scrapy.utils.project import get_project_settings
-
+import ConfigParser
 class Configurator:
    """
@ -47,3 +47,35 @@ class Configurator:
                log.start(logstdout=False, loglevel=log.DEBUG)
            else:
                log.start(logstdout=True, loglevel=log.WARNING)
    @staticmethod
    def read_sourceconfiguration(file_name='sources.cfg'):
        """
        This function reads a configuration file holding the configuration
        variables for sources. By default this is sources.cfg, which is
        looked up relative to the current working directory.
        :param file_name: the path of the configuration file to read
        :return a ConfigParser object of the configuration file
        """
        config = ConfigParser.ConfigParser()
        # ConfigParser.read skips files it cannot open, so a missing file
        # results in a parser without any sections instead of an error.
        config.read(file_name)
        return config
    @staticmethod
    def get_section(config, sourcename):
        """
        This function reads a config section labeled in variable sourcename and
        tests whether the reliability variable is set else set to empty string.
        Return the default section if the labeled config section does not exist
        The returned dictionary is always a new object, so changing it does
        not change the ConfigParser object.
        :param config: a ConfigParser object
        :param sourcename: the name of the section to be read
        :return a dictionary of the section in the config labeled in sourcename
        """
        if config.has_section(sourcename):
            section = dict(config.items(sourcename))
        else:
            # Copy the defaults: config.defaults() is the mapping the parser
            # itself uses, so writing the reliability fallback (or anything a
            # source adds later) into it would change the DEFAULT section
            # seen by every other source.
            section = dict(config.defaults())
        if 'reliability' not in section:
            log.msg('Reliability not set for %s' % sourcename,
                    level=log.WARNING)
            section['reliability'] = ''
        return section
--- a/utils/sourceloader.py
+++ b/utils/sourceloader.py
@ -3,7 +3,7 @@ import os
 import re
 from FourmiCrawler.sources.source import Source
-
+from utils.configurator import Configurator
 class SourceLoader:
    sources = []
@ -11,18 +11,23 @@ class SourceLoader:
    def __init__(self, rel_dir="../FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
        Also loads a configuration file for Sources and passes the arguments in
        the named section to the source
        :param rel_dir: A relative path to a directory.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()
        config = Configurator.read_sourceconfiguration()
        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
-                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    sourcecfg = Configurator.get_section(config, cls.__name__)
                    self.sources.append(cls(sourcecfg))
                    known_parser.add(cls)
    def include(self, source_names):
@ -55,4 +60,4 @@ class SourceLoader:
            string += "Source: " + src.__class__.__name__
            string += " - "
            string += "URI: " + src.website + "\n"
-        return string
+        return string