Trying to merge pubchem with develop because fourmi on my current branch was broken

2014-06-11 17:36:17 +02:00 · 2014-06-11 17:36:17 +02:00 · 830b785b0a
commit 830b785b0a
parent a903e78f9e a1859f2ec2
15 changed files with 362 additions and 202 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,6 +4,9 @@
 #Python Specific ignores
 *.pyc

+#may contain authentication information
+sources.cfg
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
--- a/.travis.yml
+++ b/.travis.yml
@ -6,10 +6,14 @@ python: 2.7
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
  - pip install Scrapy docopt
+  - pip install coveralls

 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests

 notifications:
-  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+
+after_success:
+  coveralls --verbose
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@ -9,7 +9,7 @@ from FourmiCrawler.items import Result


 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
-
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not

 class ChemSpider(Source):
    """ChemSpider scraper for synonyms and properties
@ -20,19 +20,23 @@ class ChemSpider(Source):
    somewhere.
    """

-    def __init__(self):
-        Source.__init__(self)
-
    website = 'http://www.chemspider.com/*'

-    # [TODO] - Save and access token of specific user.
-    search = ('Search.asmx/SimpleSearch?query=%s&token='
-              '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
-    extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
-                    '052bfd06-5ce4-43d6-bf12-89eabefd2338')
+    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config
+        self.ignore_list = []
+        if 'token' not in self.cfg or self.cfg['token'] == '':
+            log.msg('ChemSpider token not set or empty, search/MassSpec API '
+                    'not available', level=log.WARNING)
+            self.cfg['token'] = ''
+        self.search += self.cfg['token']
+        self.extendedinfo += self.cfg['token']

-    ignore_list = []

    def parse(self, response):
        sel = Selector(response)
@ -44,8 +48,7 @@ class ChemSpider(Source):

        return requests

-    @staticmethod
-    def parse_properties(sel):
+    def parse_properties(self, sel):
        """scrape Experimental Data and Predicted ACD/Labs tabs"""
        properties = []

@ -76,13 +79,12 @@ class ChemSpider(Source):
                prop_value = m.group(1)
                prop_conditions = m.group(2)

-            new_prop = Result({
-                'attribute': prop_name,
-                'value': prop_value,
-                'source': 'ChemSpider Predicted - ACD/Labs Tab',
-                'reliability': 'Unknown',
-                'conditions': prop_conditions
-            })
+            new_prop = self.newresult(
+                attribute=prop_name,
+                value=prop_value,
+                source='ChemSpider Predicted - ACD/Labs Tab',
+                conditions=prop_conditions
+            )
            properties.append(new_prop)
            log.msg('CS prop: |%s| |%s| |%s|' %
                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
@ -100,14 +102,11 @@ class ChemSpider(Source):
            if line.xpath('span/text()'):
                property_name = line.xpath('span/text()').extract()[0].rstrip()
            else:
-                new_prop = Result({
-                    'attribute': property_name[:-1],
-                    'value': line.xpath('text()').extract()[0].rstrip(),
-                    'source': line.xpath(
-                        'strong/text()').extract()[0].rstrip(),
-                    'reliability': 'Unknown',
-                    'conditions': ''
-                })
+                new_prop = self.newresult(
+                    attribute=property_name[:-1],
+                    value=line.xpath('text()').extract()[0].rstrip(),
+                    source=line.xpath('strong/text()').extract()[0].rstrip(),
+                )
                properties.append(new_prop)
                log.msg('CS prop: |%s| |%s| |%s|' %
                        (new_prop['attribute'], new_prop['value'],
@ -183,25 +182,31 @@ class ChemSpider(Source):
        }
        return synonym

-    @staticmethod
-    def parse_extendedinfo(response):
+    def parse_extendedinfo(self, response):
        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
        sel = Selector(response)
        properties = []
        names = sel.xpath('*').xpath('name()').extract()
        values = sel.xpath('*').xpath('text()').extract()
        for (name, value) in zip(names, values):
-            result = Result({
-                'attribute': name,
-                'value': value,  # These values have no unit!
-                'source': 'ChemSpider ExtendedCompoundInfo',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=name,
+                value=value,  # These values have no unit!
+                source='ChemSpider ExtendedCompoundInfo',
+            )
            if result['value']:
                properties.append(result)
        return properties

+    def newresult(self, attribute, value, conditions='', source='ChemSpider'):
+        return Result({
+                'attribute': attribute,
+                'value': value,
+                'source': source,
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
+                })
+
    def parse_searchrequest(self, response):
        """Parse the initial response of the ChemSpider Search API """
        sel = Selector(response)
@ -224,7 +229,7 @@ class ChemSpider(Source):
                        callback=self.parse_extendedinfo)]

    def new_compound_request(self, compound):
-        if compound in self.ignore_list:  # [TODO] - add regular expression
+        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
        searchurl = self.website[:-1] + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@ -22,10 +22,12 @@ class NIST(Source):

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

-    ignore_list = set()
+    cfg = {}

-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.ignore_list = set()
+        self.cfg = config

    def parse(self, response):
        sel = Selector(response)
@ -114,13 +116,10 @@ class NIST(Source):

        requests = []
        for key, value in data.iteritems():
-            result = Result({
-                'attribute': key,
-                'value': value,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': ''
-            })
+            result = self.newresult(
+                attribute=key,
+                value=value
+            )
            requests.append(result)

        return requests
@ -150,19 +149,16 @@ class NIST(Source):
                name = m.group(1)
                condition = m.group(2)

-            result = Result({
-                'attribute': name,
-                'value': data[1] + ' ' + data[2],
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value=data[1] + ' ' + data[2],
+                conditions=condition
+            )
            log.msg('NIST: |%s|' % data, level=log.DEBUG)
            results.append(result)
        return results

-    @staticmethod
-    def parse_transition_data(table, summary):
+    def parse_transition_data(self, table, summary):
        """Parses the table containing properties regarding phase changes"""
        results = []

@ -174,19 +170,16 @@ class NIST(Source):

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            )
            results.append(result)

        return results

-    @staticmethod
-    def parse_generic_data(table, summary):
+    def parse_generic_data(self, table, summary):
        """Parses the common tables of 4 and 5 rows. Assumes they are of the
        form:
        Symbol (unit)|Temperature (K)|Method|Reference|Comment
@ -202,36 +195,30 @@ class NIST(Source):

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': tds[0] + ' ' + unit,
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[1]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value=tds[0] + ' ' + unit,
+                conditions='%s K' % tds[1]
+            )
            results.append(result)
        return results

-    @staticmethod
-    def parse_antoine_data(table, summary):
+    def parse_antoine_data(self, table, summary):
        """Parse table containing parameters for the Antoine equation"""
        results = []

        for tr in table.xpath('tr[td]'):
            tds = tr.xpath('td/text()').extract()
-            result = Result({
-                'attribute': summary,
-                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': '%s K' % tds[0]
-            })
+            result = self.newresult(
+                attribute=summary,
+                value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                conditions='%s K' % tds[0]
+            )
            results.append(result)

        return results

-    @staticmethod
-    def parse_individual_datapoints(response):
+    def parse_individual_datapoints(self, response):
        """Parses the page linked from aggregate data"""
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
@ -258,17 +245,24 @@ class NIST(Source):
            if m:
                uncertainty = '+- %s ' % m.group(1)
                # [TODO]: get the plusminus sign working in here
-            result = Result({
-                'attribute': name,
-                'value': '%s %s%s' % (tds[0], uncertainty, unit),
-                'source': 'NIST',
-                'reliability': 'Unknown',
-                'conditions': condition
-            })
+            result = self.newresult(
+                attribute=name,
+                value='%s %s%s' % (tds[0], uncertainty, unit),
+                conditions=condition
+            )
            results.append(result)

        return results

+    def newresult(self, attribute, value, conditions=''):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'NIST',
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+            })
+
    def new_compound_request(self, compound):
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@ -1,29 +1,34 @@
-import re
-
 from scrapy.http import Request
 from scrapy import log
-from scrapy.selector import Selector
-
 from source import Source
+from scrapy.selector import Selector
 from FourmiCrawler.items import Result
+import re


 class WikipediaParser(Source):
    """ Wikipedia scraper for chemical properties

    This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values.
-     It also returns requests with other external sources which contain information on parsed subject.
+    It also returns requests with other external sources which contain information on parsed subject.
    """

    website = "http://en.wikipedia.org/wiki/*"
    __spider = None
    searched_compounds = []

-    def __init__(self):
-        Source.__init__(self)
+    cfg = {}
+
+    def __init__(self, config={}):
+        Source.__init__(self, config)
+        self.cfg = config

    def parse(self, response):
-        """ Distributes the above described behaviour """
+        """
+        Distributes the above described behaviour
+        :param response: The incoming search request
+        :return: Returns the found properties if response is unique or returns none if it's already known
+        """
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page
@ -35,43 +40,21 @@ class WikipediaParser(Source):
            return items

    def parse_infobox(self, sel):
-        """ scrape data from infobox on wikipedia. """
+        """
+        Scrape data from infobox on wikipedia.
+
+        Data is scraped from two types of infoboxes: class="infobox bordered" and class="infobox".
+        :param sel: The selector with the html-information of the page to parse
+        :return: item_list: Returns a list of properties with their values, source, etc..
+        """
+
        items = []

-        # be sure to get chembox (wikipedia template)
-        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
-            xpath('normalize-space(string())')
-        prop_names = tr_list[::2]
-        prop_values = tr_list[1::2]
-        for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "Wikipedia",
-                'reliability': "Unknown",
-                'conditions': ""
-            })
-            items.append(item)
-            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        # scrape the chembox (wikipedia template)
+        items = self.parse_chembox(sel, items)

-        #scrape the  drugbox (wikipedia template)
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
-        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
-        for tablerow in tr_list2:
-            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
-            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
-                    'normalize-space(string())'):
-                item = Result({
-                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
-                    'source': "Wikipedia",
-                    'reliability': "Unknown",
-                    'conditions': ""
-                })
-                items.append(item)
-                log.msg(
-                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
-                    level=log.DEBUG)
+        #scrape the drugbox (wikipedia template)
+        items = self.parse_drugbox(sel, items)

        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
        item_list = self.clean_items(items)
@ -95,12 +78,66 @@ class WikipediaParser(Source):

        return item_list

+    def parse_chembox(self, sel, items):
+        """
+        Scrape data from chembox infobox on wikipedia.
+
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list of items in which the results have to be stored
+        :return: items: the list of items, including the newly found and stored items
+        """
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
+            xpath('normalize-space(string())')
+        prop_names = tr_list[::2]
+        prop_values = tr_list[1::2]
+        for i, prop_name in enumerate(prop_names):
+            item = self.newresult(
+                attribute=prop_name.extract().encode('utf-8'),
+                value=prop_values[i].extract().encode('utf-8')
+            )
+            items.append(item)
+            log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        return items
+
+    def parse_drugbox(self, sel, items):
+        """
+        Scrape data from drugbox infobox on wikipedia.
+
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list of items in which the results have to be stored
+        :return: items: the list of items, including the newly found and stored items
+        """
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
+                item = self.newresult(
+                    attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                )
+                items.append(item)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
+        return items
+
+
    def new_compound_request(self, compound):
        return Request(url=self.website[:-1] + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
-        """ clean up properties using regex, makes it possible to split the values from the units """
+
+        """
+        Clean up properties using regex, makes it possible to split the values from the units
+
+        Almost not in use, only cleans J/K/mol values and boiling/melting points.
+
+        :param items: List of properties with their values, source, etc..
+        :return: items: List of now cleaned up items
+        """
        for item in items:
            value = item['value']
            m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
@ -113,7 +150,21 @@ class WikipediaParser(Source):

    @staticmethod
    def get_identifiers(sel):
-        """ find external links, named 'Identifiers' to different sources. """
+        """
+        Find external links, named 'Identifiers' to different sources.
+
+        :param sel: The selector with the html-information of the page to parse
+        :return: links: New links which can be used to expand the crawlers search
+        """
        links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
                          '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
-        return links
+        return links
+
+    def newresult(self, attribute, value):
+        return Result({
+            'attribute': attribute,
+            'value': value,
+            'source': 'Wikipedia',
+            'reliability': self.cfg['reliability'],
+            'conditions': ''
+            })
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@ -6,7 +6,7 @@ class Source:
    website = "http://something/*"  # Regex of URI's the source is able to parse
    _spider = None

-    def __init__(self):
+    def __init__(self, config={}):
        """
        Initiation of a new Source
        """
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@ -9,8 +9,6 @@ class FourmiSpider(Spider):
    A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"
-    _sources = []
-    synonyms = set()

    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """
@ -18,6 +16,8 @@ class FourmiSpider(Spider):
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the attributes should match.
        """
+        self._sources = []
+        self.synonyms = set()
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.synonyms.add(compound)
        self.selected_attributes = selected_attributes
@ -35,14 +35,14 @@ class FourmiSpider(Spider):
                return source.parse(response)
        return None

-    def get_synonym_requests(self, compound):
+    def get_synonym_requests(self, compound, force=False):
        """
        A function that generates new Scrapy Request for each source given a new synonym of a compound.
        :param compound: A compound name
        :return: A list of Scrapy Request objects
        """
        requests = []
-        if compound not in self.synonyms:
+        if force or compound not in self.synonyms:
            self.synonyms.add(compound)
            for parser in self._sources:
                parser_requests = parser.new_compound_request(compound)
@ -57,7 +57,7 @@ class FourmiSpider(Spider):
        """
        requests = []
        for synonym in self.synonyms:
-            requests.extend(self.get_synonym_requests(synonym))
+            requests.extend(self.get_synonym_requests(synonym, force=True))
        return requests

    def add_sources(self, sources):
--- a/README.md
+++ b/README.md
@ -1,8 +1,8 @@
 # Fourmi

-**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi)  [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master)

-**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
+**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi)  [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)

 Fourmi is a web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
--- a/fourmi.py
+++ b/fourmi.py
@ -1,4 +1,4 @@
-# !/usr/bin/env python
+#!/usr/bin/env python
 """
 Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).

@ -17,8 +17,8 @@ Options:
    --version                       Show version.
    --verbose                       Verbose logging output.
    --log=<file>                    Save log to an file.
-    -o <file> --output=<file>       Output file [default: result.*format*]
-    -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+    -o <file> --output=<file>       Output file [default: results.*format*]
+    -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: csv]
    --include=<regex>               Include only sources that match these regular expressions split by a comma.
    --exclude=<regex>               Exclude the sources that match these regular expressions split by a comma.
 """
@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
 import docopt

 from FourmiCrawler.spider import FourmiSpider
-from sourceloader import SourceLoader
+from utils.configurator import Configurator
+from utils.sourceloader import SourceLoader


 def setup_crawler(compound, settings, source_loader, attributes):
@ -50,59 +51,22 @@ def setup_crawler(compound, settings, source_loader, attributes):
    crawler.start()


-def scrapy_settings_manipulation(docopt_arguments):
-    """
-    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
-    project these are command line arguments.
-    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
-    """
-    settings = get_project_settings()
-
-    if docopt_arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
-    elif docopt_arguments["--format"] == "jsonlines":
-        settings.overrides["FEED_URI"] = "results.json"
-    elif docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
-
-    if docopt_arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
-
-    return settings
-
-
-def start_log(docopt_arguments):
-    """
-    This function starts the logging functionality of Scrapy using the settings given by the CLI.
-    :param docopt_arguments:  A dictionary generated by docopt containing all CLI arguments.
-    """
-    if docopt_arguments["--log"] is not None:
-        if docopt_arguments["--verbose"]:
-            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
-    else:
-        if docopt_arguments["--verbose"]:
-            log.start(logstdout=False, loglevel=log.DEBUG)
-        else:
-            log.start(logstdout=True, loglevel=log.WARNING)
-
-
 def search(docopt_arguments, source_loader):
    """
    The function that facilitates the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
-    start_log(docopt_arguments)
-    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
+    conf = Configurator()
+    conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
    reactor.run()


 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
    loader = SourceLoader()

    if arguments["--include"]:
--- a/tests/test_configurator.py
+++ b/tests/test_configurator.py
@ -0,0 +1,50 @@
+import unittest
+from utils.configurator import Configurator
+
+import ConfigParser
+
+class TestConfigurator(unittest.TestCase):
+
+    def setUp(self):
+        self.conf = Configurator()
+
+    def test_set_output(self):
+        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+        self.conf.set_output("results.*format*", "jsonlines")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
+
+        self.conf.set_output("results.*format*", "csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
+
+    # def test_start_log(self):
+    #     self.conf.start_log("test.log", True)
+    #     self.conf.start_log("test.log", False)
+    #     self.conf.start_log(None, True)
+    #     self.conf.start_log(None, False)
+
+    def test_read_sourceconfiguration(self):
+        config = self.conf.read_sourceconfiguration()
+        self.assertIsInstance(config, ConfigParser.ConfigParser)
+
+    def test_get_section(self):
+        config = ConfigParser.ConfigParser()
+        section = self.conf.get_section(config, 'test')
+        self.assertIn('reliability', section)
+        self.assertEquals(section['reliability'], '')
+
+        config.set('DEFAULT', 'reliability', 'Low')
+
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+
+        config.add_section('test')
+        config.set('test', 'var', 'Maybe')
+
+        section = self.conf.get_section(config, 'test')
+        self.assertEquals(section['reliability'], 'Low')
+        self.assertEqual(section['var'], 'Maybe')
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@ -1,6 +1,6 @@
 import unittest

-from sourceloader import SourceLoader
+from utils.sourceloader import SourceLoader


 class TestSourceloader(unittest.TestCase):
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@ -3,7 +3,7 @@ import unittest
 from scrapy.http import Request

 from FourmiCrawler import spider
-from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.NIST import NIST
 from FourmiCrawler.sources.source import Source


@ -41,9 +41,12 @@ class TestFoumiSpider(unittest.TestCase):
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])

-        src2 = ChemSpider()
+        src2 = NIST()
        self.spi.add_source(src2)
-        self.assertIsNotNone(self.spi.start_requests())
+        requests = self.spi.start_requests()
+        self.assertGreater(len(requests), 0)
+        self.assertIsInstance(requests[0], Request)
+

    def test_synonym_requests(self):
        # A test for the synonym request function
@ -54,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase):
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)

-        src2 = ChemSpider()
+        src2 = NIST()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
-        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
+        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
--- a/utils/init.py
+++ b/utils/init.py
--- a/utils/configurator.py
+++ b/utils/configurator.py
@ -0,0 +1,81 @@
+from scrapy import log
+from scrapy.utils.project import get_project_settings
+import ConfigParser
+
+class Configurator:
+    """
+    A helper class in the fourmi class. This class is used to process the settings as set
+    from one of the Fourmi applications.
+    """
+
+    def __init__(self):
+        # Scrapy project settings; set_output() writes its overrides into this object.
+        self.scrapy_settings = get_project_settings()
+
+    def set_output(self, filename, fileformat):
+        """
+        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
+        In the Fourmi project these are command line arguments.
+        :param filename: The filename of the file where the output will be put.
+        :param fileformat: The format in which the output will be.
+        """
+        # Any filename other than the 'results.*format*' value is used as given;
+        # otherwise the name is derived from the format (jsonlines -> results.json).
+        if filename != 'results.*format*':
+            self.scrapy_settings.overrides["FEED_URI"] = filename
+        elif fileformat == "jsonlines":
+            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+        elif fileformat is not None:
+            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+
+        if fileformat is not None:
+            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
+
+    def start_log(self, logfile, verbose):
+        """
+        This function starts the logging functionality of Scrapy using the settings given by the CLI.
+        :param logfile: The location where the logfile will be saved, or None for no logfile.
+        :param verbose: A boolean value to switch between loglevels.
+        """
+        # Verbose: DEBUG level with logstdout off; otherwise WARNING level with logstdout on.
+        options = {'logstdout': not verbose,
+                   'loglevel': log.DEBUG if verbose else log.WARNING}
+        if logfile is not None:
+            # Only pass logfile when one was given, so Scrapy's own default applies otherwise.
+            options['logfile'] = logfile
+        log.start(**options)
+
+    @staticmethod
+    def read_sourceconfiguration():
+        """
+        This function reads sources.cfg from the current working directory for
+        configuration variables for sources. A missing file is not an error: the
+        returned parser is then simply empty.
+        :return: a ConfigParser object of sources.cfg
+        """
+        config = ConfigParser.ConfigParser()
+        config.read('sources.cfg') # [TODO]: should be softcoded eventually
+        return config
+
+    @staticmethod
+    def get_section(config, sourcename):
+        """
+        This function reads a config section labeled in variable sourcename and
+        tests whether the reliability variable is set else set to empty string.
+        Return the default section if the labeled config section does not exist.
+        The returned dictionary is a copy, so changing it does not alter config.
+        :param config: a ConfigParser object
+        :param sourcename: the name of the section to be read
+        :return: a dictionary of the section in the config labeled in sourcename
+        """
+        if config.has_section(sourcename):
+            section = dict(config.items(sourcename))
+        else:
+            # Copy the defaults: defaults() hands back the parser's own mapping, and
+            # writing 'reliability' into it would leak into every later lookup.
+            section = dict(config.defaults())
+        if 'reliability' not in section:
+            log.msg('Reliability not set for %s' % sourcename,
+                    level=log.WARNING)
+            section['reliability'] = ''
+        return section
--- a/utils/sourceloader.py
+++ b/utils/sourceloader.py
@ -3,26 +3,31 @@ import os
 import re

 from FourmiCrawler.sources.source import Source
-
+from utils.configurator import Configurator

 class SourceLoader:
    sources = []

-    def __init__(self, rel_dir="FourmiCrawler/sources"):
+    def __init__(self, rel_dir="../FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        Also loads a configuration file for Sources and passes the arguments in
+        the named section to the source
        :param rel_dir: A relative path to a directory.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()

+        config = Configurator.read_sourceconfiguration()
+
        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
            for cls in classes:
                if issubclass(cls, Source) and cls not in known_parser:
-                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    sourcecfg = Configurator.get_section(config, cls.__name__)
+                    self.sources.append(cls(sourcecfg))
                    known_parser.add(cls)

    def include(self, source_names):
@ -55,4 +60,4 @@ class SourceLoader:
            string += "Source: " + src.__class__.__name__
            string += " - "
            string += "URI: " + src.website + "\n"
-        return string
+        return string