
Trying to merge pubchem with develop because Fourmi on my current branch was broken.

Nout van Deijck 2014-06-11 17:36:17 +02:00
commit 830b785b0a
15 changed files with 362 additions and 202 deletions

.gitignore
@@ -4,6 +4,9 @@
 #Python Specific ignores
 *.pyc

+#may contain authentication information
+sources.cfg
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
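
Note: the sources.cfg file itself is deliberately kept out of the repository. A minimal sketch of what it could look like, inferred from the code below (Configurator.get_section looks up a section named after each source class, expects a reliability value, and ChemSpider reads a token); the exact keys and values shown here are assumptions, not part of this commit:

    [DEFAULT]
    reliability = Unknown

    [ChemSpider]
    # security token for the ChemSpider Search/MassSpec APIs (placeholder value)
    token = <your ChemSpider security token>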

.travis.yml
@@ -6,10 +6,14 @@ python: 2.7
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
   - pip install Scrapy docopt
+  - pip install coveralls
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+after_success:
+  coveralls --verbose

FourmiCrawler/sources/ChemSpider.py
@@ -9,7 +9,7 @@ from FourmiCrawler.items import Result

 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not

 class ChemSpider(Source):
     """ChemSpider scraper for synonyms and properties

@@ -20,19 +20,23 @@ class ChemSpider(Source):
     somewhere.
     """

-    def __init__(self):
-        Source.__init__(self)

     website = 'http://www.chemspider.com/*'
-    # [TODO] - Save and access token of specific user.
-    search = ('Search.asmx/SimpleSearch?query=%s&token='
-              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')

-    ignore_list = []
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
+        self.ignore_list = []
+        if 'token' not in self.cfg or self.cfg['token'] == '':
+            log.msg('ChemSpider token not set or empty, search/MassSpec API '
+                    'not available', level=log.WARNING)
+            self.cfg['token'] = ''
+        self.search += self.cfg['token']
+        self.extendedinfo += self.cfg['token']

     def parse(self, response):
         sel = Selector(response)

@@ -44,8 +48,7 @@ class ChemSpider(Source):
         return requests

-    @staticmethod
-    def parse_properties(sel):
+    def parse_properties(self, sel):
         """scrape Experimental Data and Predicted ACD/Labs tabs"""

         properties = []

@@ -76,13 +79,12 @@ class ChemSpider(Source):
                 prop_value = m.group(1)
                 prop_conditions = m.group(2)

-                new_prop = Result({
-                    'attribute': prop_name,
-                    'value': prop_value,
-                    'source': 'ChemSpider Predicted - ACD/Labs Tab',
-                    'reliability': 'Unknown',
-                    'conditions': prop_conditions
-                })
+                new_prop = self.newresult(
+                    attribute=prop_name,
+                    value=prop_value,
+                    source='ChemSpider Predicted - ACD/Labs Tab',
+                    conditions=prop_conditions
+                )
                 properties.append(new_prop)
                 log.msg('CS prop: |%s| |%s| |%s|' %
                         (new_prop['attribute'], new_prop['value'], new_prop['source']),

@@ -100,14 +102,11 @@ class ChemSpider(Source):
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
             else:
-                new_prop = Result({
-                    'attribute': property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
-                        'strong/text()').extract()[0].rstrip(),
-                    'reliability': 'Unknown',
-                    'conditions': ''
-                })
+                new_prop = self.newresult(
+                    attribute=property_name[:-1],
+                    value=line.xpath('text()').extract()[0].rstrip(),
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
+                )
                 properties.append(new_prop)
                 log.msg('CS prop: |%s| |%s| |%s|' %
                         (new_prop['attribute'], new_prop['value'],

@@ -183,25 +182,31 @@ class ChemSpider(Source):
         }
         return synonym

-    @staticmethod
-    def parse_extendedinfo(response):
+    def parse_extendedinfo(self, response):
         """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
         values = sel.xpath('*').xpath('text()').extract()
         for (name, value) in zip(names, values):
-            result = Result({
-                'attribute': name,
-                'value': value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=name,
+                value=value,  # These values have no unit!
+                source='ChemSpider ExtendedCompoundInfo',
+            )
             if result['value']:
                 properties.append(result)
         return properties

+    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
     def parse_searchrequest(self, response):
         """Parse the initial response of the ChemSpider Search API """
         sel = Selector(response)

@@ -224,7 +229,7 @@ class ChemSpider(Source):
                         callback=self.parse_extendedinfo)]

     def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        if compound in self.ignore_list or self.cfg['token'] == '':
             return None
         searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)

FourmiCrawler/sources/NIST.py
@@ -22,10 +22,12 @@ class NIST(Source):
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    ignore_list = set()
+    cfg = {}

-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.ignore_list = set()
+        self.cfg = config

     def parse(self, response):
         sel = Selector(response)

@@ -114,13 +116,10 @@ class NIST(Source):
         requests = []

         for key, value in data.iteritems():
-            result = Result({
-                'attribute': key,
-                'value': value,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=key,
+                value=value
+            )
             requests.append(result)

         return requests

@@ -150,19 +149,16 @@ class NIST(Source):
                 name = m.group(1)
                 condition = m.group(2)

-            result = Result({
-                'attribute': name,
-                'value': data[1] + ' ' + data[2],
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value=data[1] + ' ' + data[2],
+                conditions=condition
+            )
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
         return results

-    @staticmethod
-    def parse_transition_data(table, summary):
+    def parse_transition_data(self, table, summary):
         """Parses the table containing properties regarding phase changes"""

         results = []

@@ -174,19 +170,16 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            )
             results.append(result)
         return results

-    @staticmethod
-    def parse_generic_data(table, summary):
+    def parse_generic_data(self, table, summary):
         """Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment

@@ -202,36 +195,30 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[1]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K' % tds[1]
+            )
             results.append(result)
         return results

-    @staticmethod
-    def parse_antoine_data(table, summary):
+    def parse_antoine_data(self, table, summary):
         """Parse table containing parameters for the Antione equation"""

         results = []
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[0]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                conditions='%s K' % tds[0]
+            )
             results.append(result)
         return results

-    @staticmethod
-    def parse_individual_datapoints(response):
+    def parse_individual_datapoints(self, response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]

@@ -258,17 +245,24 @@ class NIST(Source):
             if m:
                 uncertainty = '+- %s ' % m.group(1)
             # [TODO]: get the plusminus sign working in here
-            result = Result({
-                'attribute': name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value='%s %s%s' % (tds[0], uncertainty, unit),
+                conditions=condition
+            )
             results.append(result)
         return results

+    def newresult(self, attribute, value, conditions=''):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'NIST',
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
+
     def new_compound_request(self, compound):
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)

FourmiCrawler/sources/WikipediaParser.py
@@ -1,29 +1,34 @@
+import re
 from scrapy.http import Request
 from scrapy import log
+from scrapy.selector import Selector
 from source import Source
-from scrapy.selector import Selector
 from FourmiCrawler.items import Result
-import re


 class WikipediaParser(Source):
     """ Wikipedia scraper for chemical properties

     This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
     It also returns requests with other external sources which contain information on parsed subject.
     """

     website = "http://en.wikipedia.org/wiki/*"
     __spider = None
     searched_compounds = []

-    def __init__(self):
-        Source.__init__(self)
+    cfg = {}
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config

     def parse(self, response):
-        """ Distributes the above described behaviour """
+        """
+        Distributes the above described behaviour
+        :param response: The incoming search request
+        :return: Returns the found properties if response is unique or returns none if it's already known
+        """
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page

@@ -35,43 +40,21 @@ class WikipediaParser(Source):
         return items

     def parse_infobox(self, sel):
-        """ scrape data from infobox on wikipedia. """
+        """
+        Scrape data from infobox on wikipedia.
+        Data from two types of infoboxes: class="infobox bordered" and class="infobox" is scraped and
+        :param sel: The selector with the html-information of the page to parse
+        :return: item_list: Returns a list of properties with their values, source, etc..
+        """
         items = []

-        # be sure to get chembox (wikipedia template)
-        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
-            xpath('normalize-space(string())')
-        prop_names = tr_list[::2]
-        prop_values = tr_list[1::2]
-        for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "Wikipedia",
-                'reliability': "Unknown",
-                'conditions': ""
-            })
-            items.append(item)
-            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        # scrape the chembox (wikipedia template)
+        items = self.parse_chembox(sel, items)

         #scrape the drugbox (wikipedia template)
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
-        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
-        for tablerow in tr_list2:
-            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
-            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
-                    'normalize-space(string())'):
-                item = Result({
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
-                    'reliability': "Unknown",
-                    'conditions': ""
-                })
-                items.append(item)
-                log.msg(
-                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
-                    level=log.DEBUG)
+        items = self.parse_drugbox(sel, items)

         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)

@@ -95,12 +78,66 @@ class WikipediaParser(Source):

         return item_list

+    def parse_chembox(self, sel, items):
+        """
+        Scrape data from chembox infobox on wikipedia.
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list of items where the result have to be stored in
+        :return: items: the list of items with the new found and stored items
+        """
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
+            xpath('normalize-space(string())')
+        prop_names = tr_list[::2]
+        prop_values = tr_list[1::2]
+        for i, prop_name in enumerate(prop_names):
+            item = self.newresult(
+                attribute=prop_name.extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
+            )
+            items.append(item)
+            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        return items
+
+    def parse_drugbox(self, sel, items):
+        """
+        Scrape data from drugbox infobox on wikipedia.
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list of items where the result have to be stored in
+        :return: items: the list of items with the new found and stored items
+        """
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
+                item = self.newresult(
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                )
+                items.append(item)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
+        return items
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + compound, callback=self.parse)

     @staticmethod
     def clean_items(items):
-        """ clean up properties using regex, makes it possible to split the values from the units """
+        """
+        Clean up properties using regex, makes it possible to split the values from the units
+        Almost not in use, only cleans J/K/mol values and boiling/melting points.
+        :param items: List of properties with their values, source, etc..
+        :return: items: List of now cleaned up items
+        """
         for item in items:
             value = item['value']
             m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)

@@ -113,7 +150,21 @@ class WikipediaParser(Source):

     @staticmethod
     def get_identifiers(sel):
-        """ find external links, named 'Identifiers' to different sources. """
+        """
+        Find external links, named 'Identifiers' to different sources.
+        :param sel: The selector with the html-information of the page to parse
+        :return: links: New links which can be used to expand the crawlers search
+        """
         links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                           '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
         return links
+
+    def newresult(self, attribute, value):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'Wikipedia',
+            'reliability': self.cfg['reliability'],
+            'conditions': ''
+        })

FourmiCrawler/sources/source.py
@@ -6,7 +6,7 @@ class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None

-    def __init__(self):
+    def __init__(self, config={}):
         """
         Initiation of a new Source
         """

FourmiCrawler/spider.py
@@ -9,8 +9,6 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    _sources = []
-    synonyms = set()

     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """

@@ -18,6 +16,8 @@ class FourmiSpider(Spider):
         :param compound: compound that will be searched.
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
+        self._sources = []
+        self.synonyms = set()
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.add(compound)
         self.selected_attributes = selected_attributes

@@ -35,14 +35,14 @@ class FourmiSpider(Spider):
             return source.parse(response)
         return None

-    def get_synonym_requests(self, compound):
+    def get_synonym_requests(self, compound, force=False):
         """
         A function that generates new Scrapy Request for each source given a new synonym of a compound.
         :param compound: A compound name
         :return: A list of Scrapy Request objects
         """
         requests = []
-        if compound not in self.synonyms:
+        if force or compound not in self.synonyms:
             self.synonyms.add(compound)
             for parser in self._sources:
                 parser_requests = parser.new_compound_request(compound)

@@ -57,7 +57,7 @@ class FourmiSpider(Spider):
         """
         requests = []
         for synonym in self.synonyms:
-            requests.extend(self.get_synonym_requests(synonym))
+            requests.extend(self.get_synonym_requests(synonym, force=True))
         return requests

     def add_sources(self, sources):

README.md
@@ -1,8 +1,8 @@
 # Fourmi

-**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master)

-**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
+**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)

 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific

fourmi.py
@@ -1,4 +1,4 @@
-# !/usr/bin/env python
+#!/usr/bin/env python
 """
 Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).

@@ -17,8 +17,8 @@ Options:
     --version                      Show version.
     --verbose                      Verbose logging output.
     --log=<file>                   Save log to an file.
-    -o <file> --output=<file>      Output file [default: result.*format*]
-    -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+    -o <file> --output=<file>      Output file [default: results.*format*]
+    -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
     --include=<regex>              Include only sources that match these regular expressions split by a comma.
     --exclude=<regex>              Exclude the sources that match these regular expressions split by a comma.
 """

@@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
 import docopt

 from FourmiCrawler.spider import FourmiSpider
-from sourceloader import SourceLoader
+from utils.configurator import Configurator
+from utils.sourceloader import SourceLoader


 def setup_crawler(compound, settings, source_loader, attributes):

@@ -50,59 +51,22 @@ def setup_crawler(compound, settings, source_loader, attributes):
     crawler.start()

-def scrapy_settings_manipulation(docopt_arguments):
-    """
-    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
-    project these are command line arguments.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    settings = get_project_settings()
-
-    if docopt_arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
-    elif docopt_arguments["--format"] == "jsonlines":
-        settings.overrides["FEED_URI"] = "results.json"
-    elif docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
-
-    if docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
-
-    return settings
-
-
-def start_log(docopt_arguments):
-    """
-    This function starts the logging functionality of Scrapy using the settings given by the CLI.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    if docopt_arguments["--log"] is not None:
-        if docopt_arguments["--verbose"]:
-            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
-    else:
-        if docopt_arguments["--verbose"]:
-            log.start(logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logstdout=True, loglevel=log.WARNING)
-
-
 def search(docopt_arguments, source_loader):
     """
     The function that facilitates the search for a specific compound.
     :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
     :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
     """
-    start_log(docopt_arguments)
-    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    conf = Configurator()
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()


 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
     loader = SourceLoader()

     if arguments["--include"]:

tests/test_configurator.py (new file)
@@ -0,0 +1,50 @@
+import unittest
+
+from utils.configurator import Configurator
+import ConfigParser
+
+
+class TestConfigurator(unittest.TestCase):
+
+    def setUp(self):
+        self.conf = Configurator()
+
+    def test_set_output(self):
+        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+        self.conf.set_output("results.*format*", "jsonlines")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
+
+        self.conf.set_output("results.*format*", "csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+    # def test_start_log(self):
+    #     self.conf.start_log("test.log", True)
+    #     self.conf.start_log("test.log", False)
+    #     self.conf.start_log(None, True)
+    #     self.conf.start_log(None, False)
+
+    def test_read_sourceconfiguration(self):
+        config = self.conf.read_sourceconfiguration()
+        self.assertIsInstance(config, ConfigParser.ConfigParser)
+
+    def test_get_section(self):
+        config = ConfigParser.ConfigParser()
+        section = self.conf.get_section(config, 'test')
+        self.assertIn('reliability', section)
+        self.assertEquals(section['reliability'], '')
+
+        config.set('DEFAULT', 'reliability', 'Low')
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+
+        config.add_section('test')
+        config.set('test', 'var', 'Maybe')
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+        self.assertEqual(section['var'], 'Maybe')

tests/test_sourceloader.py
@@ -1,6 +1,6 @@
 import unittest

-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader


 class TestSourceloader(unittest.TestCase):

tests/test_spider.py
@@ -3,7 +3,7 @@ import unittest
 from scrapy.http import Request

 from FourmiCrawler import spider
-from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.NIST import NIST
 from FourmiCrawler.sources.source import Source

@@ -41,9 +41,12 @@ class TestFoumiSpider(unittest.TestCase):
         self.spi.add_source(src)
         self.assertEqual(self.spi.start_requests(), [])

-        src2 = ChemSpider()
+        src2 = NIST()
         self.spi.add_source(src2)
-        self.assertIsNotNone(self.spi.start_requests())
+        requests = self.spi.start_requests()
+        self.assertGreater(len(requests), 0)
+        self.assertIsInstance(requests[0], Request)

     def test_synonym_requests(self):
         # A test for the synonym request function

@@ -54,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
         self.assertIn("new_compound", self.spi.synonyms)

-        src2 = ChemSpider()
+        src2 = NIST()
         self.spi.add_source(src2)
         self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
         self.assertIn("other_compound", self.spi.synonyms)
         self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])

utils/__init__.py (new, empty file)

utils/configurator.py (new file)
@@ -0,0 +1,81 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+import ConfigParser
+
+
+class Configurator:
+    """
+    A helper class in the fourmi class. This class is used to process the settings as set
+    from one of the Fourmi applications.
+    """
+
+    def __init__(self):
+        self.scrapy_settings = get_project_settings()
+
+    def set_output(self, filename, fileformat):
+        """
+        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
+        In the Fourmi project these are command line arguments.
+        :param filename: The filename of the file where the output will be put.
+        :param fileformat: The format in which the output will be.
+        """
+        if filename != 'results.*format*':
+            self.scrapy_settings.overrides["FEED_URI"] = filename
+        elif fileformat == "jsonlines":
+            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+        elif fileformat is not None:
+            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+
+        if fileformat is not None:
+            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
+
+    def start_log(self, logfile, verbose):
+        """
+        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        :param logfile: The location where the logfile will be saved.
+        :param verbose: A boolean value to switch between loglevels.
+        """
+        if logfile is not None:
+            if verbose:
+                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
+        else:
+            if verbose:
+                log.start(logstdout=False, loglevel=log.DEBUG)
+            else:
+                log.start(logstdout=True, loglevel=log.WARNING)
+
+    @staticmethod
+    def read_sourceconfiguration():
+        """
+        This function reads sources.cfg in the main folder for configuration
+        variables for sources
+        :return a ConfigParser object of sources.cfg
+        """
+        config = ConfigParser.ConfigParser()
+        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
+        return config
+
+    @staticmethod
+    def get_section(config, sourcename):
+        """
+        This function reads a config section labeled in variable sourcename and
+        tests whether the reliability variable is set else set to empty string.
+        Return the default section if the labeled config section does not exist
+        :param config: a ConfigParser object
+        :param sourcename: the name of the section to be read
+        :return a dictionary of the section in the config labeled in sourcename
+        """
+        section = dict()
+        if config.has_section(sourcename):
+            section = dict(config.items(sourcename))
+        elif config.defaults():
+            section = config.defaults()
+        if 'reliability' not in section:
+            log.msg('Reliability not set for %s' % sourcename,
+                    level=log.WARNING)
+            section['reliability'] = ''
+        return section
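
For context, a minimal usage sketch of the two static helpers above (an assumed example that mirrors how utils/sourceloader.py below wires each source to its config section; the 'ChemSpider' section name and the dict contents are illustrative, not taken from the commit):

    config = Configurator.read_sourceconfiguration()           # parses sources.cfg in the working directory
    section = Configurator.get_section(config, 'ChemSpider')   # e.g. {'reliability': 'Unknown', 'token': '...'}
    source = ChemSpider(section)                                # every Source subclass now accepts such a dict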

utils/sourceloader.py
@@ -3,26 +3,31 @@ import os
 import re

 from FourmiCrawler.sources.source import Source
+from utils.configurator import Configurator


 class SourceLoader:
     sources = []

-    def __init__(self, rel_dir="FourmiCrawler/sources"):
+    def __init__(self, rel_dir="../FourmiCrawler/sources"):
         """
         The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        Also loads a configuration file for Sources and passes the arguments in
+        the named section to the source
         :param rel_dir: A relative path to a directory.
         """
         path = os.path.dirname(os.path.abspath(__file__))
         path += "/" + rel_dir
         known_parser = set()

+        config = Configurator.read_sourceconfiguration()
+
         for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
             classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
             for cls in classes:
                 if issubclass(cls, Source) and cls not in known_parser:
-                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    sourcecfg = Configurator.get_section(config, cls.__name__)
+                    self.sources.append(cls(sourcecfg))
                     known_parser.add(cls)

     def include(self, source_names):

@@ -55,4 +60,4 @@ class SourceLoader:
         string += "Source: " + src.__class__.__name__
         string += " - "
         string += "URI: " + src.website + "\n"
         return string