Merge branch 'feature/sources-configuration' into develop

2014-06-08 22:36:02 +02:00 · 2014-06-08 22:36:02 +02:00 · 1ab7d0ba76
commit 1ab7d0ba76
parent efb7d60079 326413effa
10 changed files with 187 additions and 119 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
 #Python Specific ignores
 *.pyc

+#may contain authentication information
+sources.cfg
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -9,7 +9,7 @@ from FourmiCrawler.items import Result


 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
-
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not

 class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties
@@ -20,19 +20,23 @@ class ChemSpider(Source):
    somewhere.
    """

-    def __init__(self):
-        Source.__init__(self)
-
    website = 'http://www.chemspider.com/*'

-    # [TODO] - Save and access token of specific user.
-    search = ('Search.asmx/SimpleSearch?query=%s&token='
-              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
+        self.ignore_list = []
+        if 'token' not in self.cfg or self.cfg['token'] == '':
+            log.msg('ChemSpider token not set or empty, search/MassSpec API '
+                    'not available', level=log.WARNING)
+            self.cfg['token'] = ''
+        self.search += self.cfg['token']
+        self.extendedinfo += self.cfg['token']

-    ignore_list = []

    def parse(self, response):
        sel = Selector(response)
@@ -44,8 +48,7 @@ class ChemSpider(Source):

        return requests

-    @staticmethod
-    def parse_properties(sel):
+    def parse_properties(self, sel):
        """scrape Experimental Data and Predicted ACD/Labs tabs"""
        properties = []

@@ -76,13 +79,12 @@ class ChemSpider(Source):
                prop_value = m.group(1)
                prop_conditions = m.group(2)

-            new_prop = Result({
-                'attribute': prop_name,
-                'value': prop_value,
-                'source': 'ChemSpider Predicted - ACD/Labs Tab',
-                'reliability': 'Unknown',
-                'conditions': prop_conditions
-            })
+            new_prop = self.newresult(
+                attribute=prop_name,
+                value=prop_value,
+                source='ChemSpider Predicted - ACD/Labs Tab',
+                conditions=prop_conditions
+            )
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
@@ -100,14 +102,11 @@ class ChemSpider(Source):
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
-                new_prop = Result({
-                    'attribute': property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
-                        'strong/text()').extract()[0].rstrip(),
-                    'reliability': 'Unknown',
-                    'conditions': ''
-                })
+                new_prop = self.newresult(
+                    attribute=property_name[:-1],
+                    value=line.xpath('text()').extract()[0].rstrip(),
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
+                )
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
@@ -183,25 +182,31 @@ class ChemSpider(Source):
        }
        return synonym

-    @staticmethod
-    def parse_extendedinfo(response):
+    def parse_extendedinfo(self, response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
-            result = Result({
-                'attribute': name,
-                'value': value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=name,
+                value=value,  # These values have no unit!
+                source='ChemSpider ExtendedCompoundInfo',
+            )
            if result['value']:
                properties.append(result)
        return properties

+    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
+        return Result({
+                'attribute': attribute,
+                'value': value,
+                'source': source,
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
+                })
+
    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API """
        sel = Selector(response)
@@ -224,7 +229,7 @@ class ChemSpider(Source):
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -22,10 +22,12 @@ class NIST(Source):

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    ignore_list = set()
+    cfg = {}

-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.ignore_list = set()
+        self.cfg = config

    def parse(self, response):
        sel = Selector(response)
@@ -114,13 +116,10 @@ class NIST(Source):

        requests = []
        for key, value in data.iteritems():
-            result = Result({
-                'attribute': key,
-                'value': value,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=key,
+                value=value
+            )
            requests.append(result)

        return requests
@@ -150,19 +149,16 @@ class NIST(Source):
                name = m.group(1)
                condition = m.group(2)

-            result = Result({
-                'attribute': name,
-                'value': data[1] + ' ' + data[2],
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value=data[1] + ' ' + data[2],
+                conditions=condition
+            )
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

-    @staticmethod
-    def parse_transition_data(table, summary):
+    def parse_transition_data(self, table, summary):
        """Parses the table containing properties regarding phase changes"""
        results = []

@@ -174,19 +170,16 @@ class NIST(Source):

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            )
            results.append(result)

        return results

-    @staticmethod
-    def parse_generic_data(table, summary):
+    def parse_generic_data(self, table, summary):
        """Parses the common tables of 4 and 5 rows. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
@@ -202,36 +195,30 @@ class NIST(Source):

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[1]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K' % tds[1]
+            )
            results.append(result)
        return results

-    @staticmethod
-    def parse_antoine_data(table, summary):
+    def parse_antoine_data(self, table, summary):
        """Parse table containing parameters for the Antione equation"""
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[0]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                conditions='%s K' % tds[0]
+            )
            results.append(result)

        return results

-    @staticmethod
-    def parse_individual_datapoints(response):
+    def parse_individual_datapoints(self, response):
        """Parses the page linked from aggregate data"""
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
@@ -258,17 +245,24 @@ class NIST(Source):
            if m:
                uncertainty = '+- %s ' % m.group(1)
                # [TODO]: get the plusminus sign working in here
-            result = Result({
-                'attribute': name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value='%s %s%s' % (tds[0], uncertainty, unit),
+                conditions=condition
+            )
            results.append(result)

        return results

+    def newresult(self, attribute, value, conditions=''):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'NIST',
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+            })
+
    def new_compound_request(self, compound):
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -19,8 +19,11 @@ class WikipediaParser(Source):
    __spider = None
    searched_compounds = []

-    def __init__(self):
-        Source.__init__(self)
+    cfg = {}
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config

    def parse(self, response):
        """ Distributes the above described behaviour """
@@ -44,13 +47,10 @@ class WikipediaParser(Source):
        prop_names = tr_list[::2]
        prop_values = tr_list[1::2]
        for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "Wikipedia",
-                'reliability': "Unknown",
-                'conditions': ""
-            })
+            item = self.newresult(
+                attribute=prop_name.extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
+            )
            items.append(item)
            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)

@@ -61,13 +61,10 @@ class WikipediaParser(Source):
            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                    'normalize-space(string())'):
-                item = Result({
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
-                    'reliability': "Unknown",
-                    'conditions': ""
-                })
+                item = self.newresult(
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                )
                items.append(item)
                log.msg(
                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
@@ -117,3 +114,12 @@ class WikipediaParser(Source):
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
        return links
+
+    def newresult(self, attribute, value):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'Wikipedia',
+            'reliability': self.cfg['reliability'],
+            'conditions': ''
+            })
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -6,7 +6,7 @@ class Source:
    website = "http://something/*"  # Regex of URI's the source is able to parse
    _spider = None

-    def __init__(self):
+    def __init__(self, config={}):
        """
        Initiation of a new Source
        """
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,8 +9,6 @@ class FourmiSpider(Spider):
    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"
-    _sources = []
-    synonyms = set()

    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """
@@ -18,6 +16,8 @@ class FourmiSpider(Spider):
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the attributes should match.
        """
+        self._sources = []
+        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.synonyms.add(compound)
        self.selected_attributes = selected_attributes
--- a/tests/test_configurator.py
+++ b/tests/test_configurator.py
@@ -1,6 +1,7 @@
 import unittest
 from utils.configurator import Configurator

+import ConfigParser

 class TestConfigurator(unittest.TestCase):

@@ -25,3 +26,25 @@ class TestConfigurator(unittest.TestCase):
    #     self.conf.start_log("test.log", False)
    #     self.conf.start_log(None, True)
    #     self.conf.start_log(None, False)
+
+    def test_read_sourceconfiguration(self):
+        config = self.conf.read_sourceconfiguration()
+        self.assertIsInstance(config, ConfigParser.ConfigParser)
+
+    def test_get_section(self):
+        config = ConfigParser.ConfigParser()
+        section = self.conf.get_section(config, 'test')
+        self.assertIn('reliability', section)
+        self.assertEquals(section['reliability'], '')
+
+        config.set('DEFAULT', 'reliability', 'Low')
+
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+
+        config.add_section('test')
+        config.set('test', 'var', 'Maybe')
+
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+        self.assertEqual(section['var'], 'Maybe')
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -3,7 +3,7 @@ import unittest
 from scrapy.http import Request

 from FourmiCrawler import spider
-from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.NIST import NIST
 from FourmiCrawler.sources.source import Source


@@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase):
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])

-        src2 = ChemSpider()
+        src2 = NIST()
        self.spi.add_source(src2)
        requests = self.spi.start_requests()
        self.assertGreater(len(requests), 0)
@@ -57,7 +57,7 @@ class TestFoumiSpider(unittest.TestCase):
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)

-        src2 = ChemSpider()
+        src2 = NIST()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
--- a/utils/configurator.py
+++ b/utils/configurator.py
@@ -1,6 +1,6 @@
 from scrapy import log
 from scrapy.utils.project import get_project_settings
-
+import ConfigParser

 class Configurator:
    """
@@ -47,3 +47,35 @@ class Configurator:
                log.start(logstdout=False, loglevel=log.DEBUG)
            else:
                log.start(logstdout=True, loglevel=log.WARNING)
+
+    @staticmethod
+    def read_sourceconfiguration():
+        """
+        This function reads sources.cfg in the main folder for configuration
+        variables for sources
+        :return a ConfigParser object of sources.cfg
+        """
+        config = ConfigParser.ConfigParser()
+        config.read('sources.cfg') # [TODO]: should be softcoded eventually
+        return config
+
+    @staticmethod
+    def get_section(config, sourcename):
+        """
+        This function reads a config section labeled in variable sourcename and
+        tests whether the reliability variable is set else set to empty string.
+        Return the default section if the labeled config section does not exist
+        :param config: a ConfigParser object
+        :param sourcename: the name of the section to be read
+        :return a dictionary of the section in the config labeled in sourcename
+        """
+        section = dict()
+        if config.has_section(sourcename):
+            section = dict(config.items(sourcename))
+        elif config.defaults():
+            section = config.defaults()
+        if 'reliability' not in section:
+            log.msg('Reliability not set for %s' % sourcename,
+                    level=log.WARNING)
+            section['reliability'] = ''
+        return section
--- a/utils/sourceloader.py
+++ b/utils/sourceloader.py
@@ -3,7 +3,7 @@ import os
 import re

 from FourmiCrawler.sources.source import Source
-
+from utils.configurator import Configurator

 class SourceLoader:
    sources = []
@@ -11,18 +11,23 @@ class SourceLoader:
    def __init__(self, rel_dir="../FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        Also loads a configuration file for Sources and passes the arguments in
+        the named section to the source
        :param rel_dir: A relative path to a directory.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()

+        config = Configurator.read_sourceconfiguration()
+
        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
-                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    sourcecfg = Configurator.get_section(config, cls.__name__)
+                    self.sources.append(cls(sourcecfg))
                    known_parser.add(cls)

    def include(self, source_names):