Merge branch 'develop' into feature/PubChem
commit 86a00b1572
@@ -1,6 +1,4 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 from scrapy.item import Item, Field
@@ -1,11 +1,27 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import re
 
 from scrapy.exceptions import DropItem
 
 
-class FourmiPipeline(object):
+class RemoveNonePipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
+    def process_item(self, item, spider):
+        """
+        Processing the items so None values are replaced by empty strings
+        :param item: The incoming item
+        :param spider: The spider which scraped the spider
+        :return: :raise DropItem: Returns the item if unique or drops them if it's already known
+        """
+        for key in item:
+            if item[key] is None:
+                item[key] = ""
+        return item
+
+
+class DuplicatePipeline(object):
 
     def __init__(self):
         self.known_values = set()
@@ -17,9 +33,27 @@ class FourmiPipeline(object):
         :param spider: The spider which scraped the spider
         :return: :raise DropItem: Returns the item if unique or drops them if it's already known
         """
-        value = item['attribute'], item['value']
+        value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
             raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item
+
+
+class AttributeSelectionPipeline(object):
+
+    def __init__(self):
+        pass;
+
+    def process_item(self, item, spider):
+        """
+        The items are processed using the selected attribute list available in the spider,
+        items that don't match the selected items are dropped.
+        :param item: The incoming item
+        :param spider: The spider which scraped the item. Should have an attribute "selected_attributes".
+        :return: :raise DropItem: Returns item if it matches an selected attribute, else it is dropped.
+        """
+        if [x for x in spider.selected_attributes if re.match(x, item["attribute"])]:
+            return item
+        else:
+            raise DropItem("Attribute not selected by used: %s" % item)
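The duplicate check now keys on conditions as well, so the same attribute and value measured under different conditions are both kept. A minimal sketch of the new behaviour (plain dicts stand in for Result items; the spider argument is unused by this pipeline):

    pipeline = DuplicatePipeline()
    first = {'attribute': 'Boiling Point', 'value': '100', 'conditions': '1 atm'}
    second = {'attribute': 'Boiling Point', 'value': '100', 'conditions': '2 atm'}

    pipeline.process_item(first, spider=None)        # kept: key not seen before
    pipeline.process_item(second, spider=None)       # also kept: conditions differ
    pipeline.process_item(dict(first), spider=None)  # raises DropItem: exact duplicate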
@@ -11,7 +11,9 @@ BOT_NAME = 'FourmiCrawler'
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.FourmiPipeline': 100
+    "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
+    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
 }
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
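The numbers attached to the pipelines are Scrapy priorities: lower values run first, so every scraped item passes through RemoveNonePipeline (100), then AttributeSelectionPipeline (200), then DuplicatePipeline (300). A rough sketch of the chain Scrapy builds from this setting (item and spider are placeholders; a DropItem raised anywhere stops the item):

    for pipeline in (RemoveNonePipeline(), AttributeSelectionPipeline(), DuplicatePipeline()):
        item = pipeline.process_item(item, spider)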
@@ -47,7 +47,6 @@ class ChemSpider(Source):
         properties = []
 
         # Predicted - ACD/Labs tab
-        # [TODO] - test if tab contains data, some chemicals do not have data here
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
@@ -58,6 +57,12 @@ class ChemSpider(Source):
             prop_value = prop_value.extract().encode('utf-8')
             prop_conditions = ''
 
+            # Test for properties without values, with one hardcoded exception
+            if (not re.match(r'^\d', prop_value) or
+                    (prop_name == 'Polarizability' and
+                     prop_value == '10-24cm3')):
+                continue
+
             # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
@@ -192,7 +197,8 @@ class ChemSpider(Source):
                 'reliability': 'Unknown',
                 'conditions': ''
             })
-            properties.append(result)
+            if result['value']:
+                properties.append(result)
         return properties
 
     def parse_searchrequest(self, response):
@@ -200,8 +206,14 @@ class ChemSpider(Source):
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
-        csid = sel.xpath('.//cs:int/text()').extract()[0]
-        # [TODO] - handle multiple csids in case of vague search term
+        csids = sel.xpath('.//cs:int/text()').extract()
+        if len(csids) == 0:
+            log.msg('ChemSpider found nothing', level=log.ERROR)
+            return
+        elif len(csids) > 1:
+            log.msg('ChemSpider found multiple substances, taking first '
+                    'element', level=log.DEBUG)
+        csid = csids[0]
         structure_url = self.website[:-1] + self.structure % csid
         extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
@@ -215,4 +227,4 @@ class ChemSpider(Source):
             return None
         searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
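The added value check in the ACD/Labs parsing skips table cells whose value does not start with a digit, with one hardcoded exception for the Polarizability unit cell. An illustration of the regex only (the sample strings are invented):

    import re

    for prop_value in ['7.9 g/mL', '10-24cm3', 'Not available']:
        print('%s -> %s' % (prop_value, bool(re.match(r'^\d', prop_value))))
    # '7.9 g/mL' -> True (kept), '10-24cm3' -> True but still skipped when the
    # property name is 'Polarizability', 'Not available' -> False (skipped)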
FourmiCrawler/sources/NIST.py (new file)
@@ -0,0 +1,273 @@
+from source import Source
+from scrapy import log
+from scrapy.http import Request
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+import re
+
+# [TODO]: values can be '128.', perhaps remove the dot in that case?
+# [TODO]: properties have references and comments which do not exist in the
+# Result item, but should be included eventually.
+
+class NIST(Source):
+    """NIST Scraper plugin
+
+    This plugin manages searching for a chemical on the NIST website
+    and parsing the resulting page if the chemical exists on NIST.
+    """
+    website = "http://webbook.nist.gov/*"
+
+    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
+
+    ignore_list = set()
+
+    def __init__(self):
+        Source.__init__(self)
+
+    def parse(self, response):
+        sel = Selector(response)
+
+        title = sel.xpath('head/title/text()').extract()[0]
+        if title == 'Name Not Found':
+            log.msg('NIST: Chemical not found!', level=log.ERROR)
+            return
+        if title not in self.ignore_list:
+            self.ignore_list.update(title)
+            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
+            self._spider.get_synonym_requests(title)
+
+        requests = []
+
+        requests.extend(self.parse_generic_info(sel))
+
+        symbol_table = {}
+        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
+        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
+            symbol = ''.join(symbol_td.xpath('node()').extract())
+            name = name_td.xpath('text()').extract()[0]
+            symbol_table[symbol] = name
+            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
+                    level=log.DEBUG)
+
+        for table in sel.xpath('//table[@class="data"]'):
+            summary = table.xpath('@summary').extract()[0]
+            if summary == 'One dimensional data':
+                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
+                requests.extend(
+                    self.parse_aggregate_data(table, symbol_table))
+            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
+                log.msg('NIST table; Enthalpy/entropy of phase transition',
+                        level=log.DEBUG)
+                requests.extend(self.parse_transition_data(table, summary))
+            elif table.xpath('tr[1]/td'):
+                log.msg('NIST table: Horizontal table', level=log.DEBUG)
+            elif summary == 'Antoine Equation Parameters':
+                log.msg('NIST table: Antoine Equation Parameters',
+                        level=log.DEBUG)
+                requests.extend(self.parse_antoine_data(table, summary))
+            elif len(table.xpath('tr[1]/th')) == 5:
+                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
+                # Symbol (unit) Temperature (K) Method Reference Comment
+                requests.extend(self.parse_generic_data(table, summary))
+            elif len(table.xpath('tr[1]/th')) == 4:
+                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
+                # Symbol (unit) Temperature (K) Reference Comment
+                requests.extend(self.parse_generic_data(table, summary))
+            else:
+                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
+                continue #Assume unsupported
+        return requests
+
+    def parse_generic_info(self, sel):
+        """Parses: synonyms, chemical formula, molecular weight, InChI,
+        InChiKey, CAS number
+        """
+        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
+        li = ul.xpath('li')
+
+        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
+        for synonym in raw_synonyms[0].strip().split(';\n'):
+            log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
+            self.ignore_list.update(synonym)
+            self._spider.get_synonym_requests(synonym)
+
+        data = {}
+
+        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
+        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
+
+        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
+        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
+
+        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
+        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
+
+        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
+                                '/tt/text()')
+        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
+
+        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
+        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
+
+        requests = []
+        for key, value in data.iteritems():
+            result = Result({
+                'attribute': key,
+                'value': value,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
+            requests.append(result)
+
+        return requests
+
+    def parse_aggregate_data(self, table, symbol_table):
+        """Parses the table(s) which contain possible links to individual
+        data points
+        """
+        results = []
+        for tr in table.xpath('tr[td]'):
+            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
+                                      '/a/@href').extract()
+            if extra_data_url:
+                request = Request(url=self.website[:-1] + extra_data_url[0],
+                                  callback=self.parse_individual_datapoints)
+                results.append(request)
+                continue
+            data = []
+            for td in tr.xpath('td'):
+                data.append(''.join(td.xpath('node()').extract()))
+
+            name = symbol_table[data[0]]
+            condition = ''
+
+            m = re.match(r'(.*) at (.*)', name)
+            if m:
+                name = m.group(1)
+                condition = m.group(2)
+
+            result = Result({
+                'attribute': name,
+                'value': data[1] + ' ' + data[2],
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': condition
+            })
+            log.msg('NIST: |%s|' % data, level=log.DEBUG)
+            results.append(result)
+        return results
+
+    @staticmethod
+    def parse_transition_data(table, summary):
+        """Parses the table containing properties regarding phase changes"""
+        results = []
+
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': summary,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            })
+            results.append(result)
+
+
+        return results
+
+    @staticmethod
+    def parse_generic_data(table, summary):
+        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+        form:
+        Symbol (unit)|Temperature (K)|Method|Reference|Comment
+        Symbol (unit)|Temperature (K)|Reference|Comment
+        """
+        results = []
+
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': summary,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[1]
+            })
+            results.append(result)
+        return results
+
+    @staticmethod
+    def parse_antoine_data(table, summary):
+        """Parse table containing parameters for the Antione equation"""
+        results = []
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': summary,
+                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[0]
+            })
+            results.append(result)
+
+        return results
+
+    def parse_individual_datapoints(self, response):
+        """Parses the page linked from aggregate data"""
+        sel = Selector(response)
+        table = sel.xpath('//table[@class="data"]')[0]
+
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+        condition = ''
+        m = re.match(r'(.*) at (.*)', name)
+        if m:
+            name = m.group(1)
+            condition = m.group(2)
+
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            uncertainty = ''
+            m = re.search('Uncertainty assigned by TRC = (.*?) ', tds[-1])
+            if m:
+                uncertainty = '+- %s ' % m.group(1)
+            # [TODO]: get the plusminus sign working in here
+            result = Result({
+                'attribute': name,
+                'value': '%s %s%s' % (tds[0], uncertainty, unit),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': condition
+            })
+            results.append(result)
+
+        return results
+
+    def new_compound_request(self, compound):
+        if compound not in self.ignore_list:
+            self.ignore_list.update(compound)
+            return Request(url=self.website[:-1] + self.search % compound,
+                           callback=self.parse)
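Several of the NIST table parsers above share two regex idioms: the unit is pulled out of the column header with re.search(r'\((.*)\)', ...), and a header of the form '<name> at <condition>' is split with re.match(r'(.*) at (.*)', ...). A small illustration with made-up header strings:

    import re

    tr_unit = 'Tboil (K)'
    m = re.search(r'\((.*)\)', tr_unit)
    unit = m.group(1) if m else '!'                 # -> 'K'

    summary = 'Enthalpy of vaporization at 298 K'
    m = re.match(r'(.*) at (.*)', summary)
    name, condition = (m.group(1), m.group(2)) if m else (summary, '')
    # name -> 'Enthalpy of vaporization', condition -> '298 K'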
@@ -36,8 +36,8 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []
 
-        #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
-        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
+        #be sure to get chembox (wikipedia template)
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
@@ -46,11 +46,31 @@ class WikipediaParser(Source):
                 'attribute': prop_name.extract().encode('utf-8'),
                 'value': prop_values[i].extract().encode('utf-8'),
                 'source': "Wikipedia",
-                'reliability': "",
+                'reliability': "Unknown",
                 'conditions': ""
             })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
+        #scrape the drugbox (wikipedia template)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
+                item = Result({
+                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'source': "Wikipedia",
+                    'reliability': "Unknown",
+                    'conditions': ""
+                })
+                items.append(item)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
+
         items = filter(lambda a: a['value'] != '', items) # remove items with an empty value
         item_list = self.clean_items(items)
@@ -7,15 +7,32 @@ class Source:
     _spider = None
 
     def __init__(self):
+        """
+        Initiation of a new Source
+        """
         pass
 
-    def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+    def parse(self, response):
+        """
+        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
+        :param response: A Scrapy Response object
+        :return: A list of Result items and new Scrapy Requests
+        """
+        log.msg("The parse function of the empty source was used.", level=log.WARNING)
         pass
 
     def new_compound_request(self, compound):
+        """
+        This function should return a Scrapy Request for the given compound request.
+        :param compound: A compound name.
+        :return: A new Scrapy Request
+        """
         # return Request(url=self.website[:-1] + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):
+        """
+        A Function to save the associated spider.
+        :param spider: A FourmiSpider object
+        """
         self._spider = spider
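source.py is the contract every source plugin implements: parse() turns a matching Response into Result items and follow-up Requests, new_compound_request() builds the first Request for a compound, and set_spider() stores the owning spider. A minimal hypothetical subclass sketched against that interface (the class name, URL pattern and search path are invented for illustration):

    from scrapy.http import Request
    from FourmiCrawler.sources.source import Source


    class ExampleSource(Source):
        website = "http://example.org/*"      # regex matched against response.url
        search = "lookup?name=%s"             # invented search endpoint

        def parse(self, response):
            return []                         # would return Result items / Requests

        def new_compound_request(self, compound):
            return Request(url=self.website[:-1] + self.search % compound,
                           callback=self.parse)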
@@ -1,42 +1,75 @@
+import re
+
 from scrapy.spider import Spider
 from scrapy import log
-import re
 
 
 class FourmiSpider(Spider):
+    """
+    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
+    """
     name = "FourmiSpider"
-    __parsers = []
+    __sources = []
     synonyms = []
 
-    def __init__(self, compound=None, *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.append(compound)
+        self.selected_attributes = selected_attributes;
 
-    def parse(self, reponse):
-        for parser in self.__parsers:
-            if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(reponse)
+    def parse(self, response):
+        """
+        The function that is called when a response to a request is available. This function distributes this to a
+        source which should be able to handle parsing the data.
+        :param response: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Request to be handled by the scrapy core.
+        """
+        for source in self.__sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
         return None
 
     def get_synonym_requests(self, compound):
+        """
+        A function that generates new Scrapy Request for each source given a new synonym of a compound.
+        :param compound: A compound name
+        :return: A list of Scrapy Request objects
+        """
         requests = []
-        for parser in self.__parsers:
+        for parser in self.__sources:
             parser_requests = parser.new_compound_request(compound)
             if parser_requests is not None:
                 requests.append(parser_requests)
         return requests
 
     def start_requests(self):
+        """
+        The function called by Scrapy for it's first Requests
+        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
+        """
        requests = []
         for synonym in self.synonyms:
             requests.extend(self.get_synonym_requests(synonym))
         return requests
 
-    def add_parsers(self, parsers):
-        for parser in parsers:
-            self.add_parser(parser)
+    def add_sources(self, sources):
+        """
+        A function to add a new Parser objects to the list of available sources.
+        :param sources: A list of Source Objects.
+        """
+        for parser in sources:
+            self.add_source(parser)
 
-    def add_parser(self, parser):
-        self.__parsers.append(parser)
-        parser.set_spider(self)
+    def add_source(self, source):
+        """
+        A function add a new Parser object to the list of available parsers.
+        :param source: A Source Object
+        """
+        self.__sources.append(source)
+        source.set_spider(self)
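The spider now works with Source objects throughout: add_sources() registers them and wires itself in via set_spider(), and parse() hands each incoming Response to the first source whose website pattern matches response.url. A hedged wiring sketch (the compound and the chosen sources are placeholders; fourmi.py normally does this through the SourceLoader):

    spider = FourmiSpider(compound="water", selected_attributes=[".*"])
    spider.add_sources([NIST(), ChemSpider()])
    # Inside parse(), dispatch is simply:
    #   re.match(source.website, response.url)
    # so the first source whose pattern matches handles the response.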
README.md (new file)
@@ -0,0 +1,81 @@
+# Fourmi
+
+Fourmi is an web scraper for chemical substances. The program is designed to be
+used as a search engine to search multiple chemical databases for a specific
+substance. The program will produce all available attributes of the substance
+and conditions associated with the attributes. Fourmi also attempts to estimate
+the reliability of each data point to assist the user in deciding which data
+should be used.
+
+The Fourmi project is open source project licensed under the MIT license. Feel
+free to contribute!
+
+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
+web scraping framework for python. Most of the functionality of this project can
+be traced to this framework. Should the documentation for this application fall
+short, we suggest you take a close look at the [Scrapy architecture]
+(http://doc.scrapy.org/en/latest/topics/architecture.html) and the [Scrapy
+documentation](http://doc.scrapy.org/en/latest/index.html).
+
+### Installing
+
+If you're installing Fourmi, please take a look at our [installation guide](...)
+on our wiki. When you've installed the application, make sure to check our
+[usage guide](...).
+
+### Using the Source
+
+To use the Fourmi source code multiple dependencies are required. Take a look at
+the [wiki page](...) on using the application source code for a step by step
+installation guide.
+
+When developing for the Fourmi project keep in mind that code readability is a
+must. To maintain the readability, code should be conform with the
+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
+code. More information about the different structures and principles of the
+Fourmi application can be found on our [wiki](...).
+
+### To Do
+
+The Fourmi project has the following goals for the nearby future:
+
+__Main goals:__
+
+- Improve our documentation and guides. (Assignee: Dekker)
+- Build an graphical user interface(GUI) as alternative for the command line
+interface(CLI). (Assignee: Harmen)
+- Compiling the source into an windows executable. (Assignee: Bas)
+- Create an configuration file to hold logins and API keys.
+- Determine reliability of our data point.
+- Create an module to gather data from NIST. (Assignee: Rob)
+- Create an module to gather data from PubChem. (Assignee: Nout)
+
+__Side goals:__
+
+- Clean and unify data.
+- Extensive reliability analysis using statistical tests.
+- Test data with Descartes 1.
+
+### Project Origin
+
+The Fourmi project was started in February of 2014 as part of a software
+engineering course at the Radboud University for students studying Computer
+Science, Information Science or Artificial Intelligence. Students participate in
+a real software development project as part of the
+[Giphouse](http://www.giphouse.nl/).
+
+This particular project was started on behalf of Ivo B. Rietveld. As a chemist
+he was in need of an application to automatically search information on chemical
+substances and create an phase diagram. The so called "Descrates" project was
+split into two teams each creating a different application that has part of the
+functionality. We are the team Descartes 2 and as we were responsible for
+creating a web crawler, we've named our application Fourmi (Englis: Ants).
+
+The following people were part of the original team:
+
+- [Jip J. Dekker](http://jip.dekker.li)
+- Rob ten Berge
+- Harmen Prins
+- Bas van Berkel
+- Nout van Deijck
+- Michail Kuznetcov
README.rst (deleted file)
@@ -1,16 +0,0 @@
-We are the team Descartes 2.
-----------------------------
-
-Our team members are:
-
-+ Rob ten Berge
-
-+ Bas van Berkel
-
-+ Nout van Deijck
-
-+ Jip J. Dekker
-
-+ Michail Kuznetcov
-
-+ Harmen Prins
fourmi.py
@@ -12,14 +12,15 @@ Usage:
     fourmi --version
 
 Options:
+  --attributes=<regex>           Include only that match these regular expressions split by a comma. [default: .*]
   -h --help                      Show this screen.
   --version                      Show version.
   --verbose                      Verbose logging output.
   --log=<file>                   Save log to an file.
   -o <file> --output=<file>      Output file [default: result.*format*]
   -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
-  --include=<sourcenames>        Include only sources that match the regular these expressions split by a comma.
-  --exclude=<sourcenames>        Exclude the sources that match the regular these expressions split by a comma.
+  --include=<regex>              Include only sources that match these regular expressions split by a comma.
+  --exclude=<regex>              Exclude the sources that match these regular expressions split by a comma.
 """
 
 from twisted.internet import reactor
@@ -32,9 +33,16 @@ from FourmiCrawler.spider import FourmiSpider
 from sourceloader import SourceLoader
 
 
-def setup_crawler(searchable, settings, source_loader):
-    spider = FourmiSpider(compound=searchable)
-    spider.add_parsers(source_loader.sources)
+def setup_crawler(compound, settings, source_loader, attributes):
+    """
+    This function prepares and start the crawler which starts the actual search on the internet
+    :param compound: The compound which should be searched
+    :param settings: A scrapy settings object
+    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
+    :param attributes: A list of regular expressions which the attribute names should match.
+    """
+    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
+    spider.add_sources(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
@@ -43,8 +51,13 @@ def setup_crawler(searchable, settings, source_loader):
 
 
 def scrapy_settings_manipulation(docopt_arguments):
+    """
+    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
+    project these are command line arguments.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
     settings = get_project_settings()
-    # [todo] - add at least a warning for files that already exist
+
     if docopt_arguments["--output"] != 'result.*format*':
         settings.overrides["FEED_URI"] = docopt_arguments["--output"]
     elif docopt_arguments["--format"] == "jsonlines":
@@ -59,6 +72,10 @@ def scrapy_settings_manipulation(docopt_arguments):
 
 
 def start_log(docopt_arguments):
+    """
+    This function starts the logging functionality of Scrapy using the settings given by the CLI.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
     if docopt_arguments["--log"] is not None:
         if docopt_arguments["--verbose"]:
             log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
@@ -72,14 +89,20 @@ def start_log(docopt_arguments):
 
 
 def search(docopt_arguments, source_loader):
+    """
+    The function that facilitates the search for a specific compound.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
+    """
     start_log(docopt_arguments)
     settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader)
+    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
 
 
+# The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
     loader = SourceLoader()
 
     if arguments["--include"]:
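The new --attributes option travels from the CLI to the AttributeSelectionPipeline: search() splits the comma-separated value into a list of regular expressions, setup_crawler() passes that list to FourmiSpider as selected_attributes, and the pipeline then keeps only items whose attribute matches one of them. A small sketch of that hand-off (the option value is an invented example):

    docopt_arguments = {"--attributes": "^Melting point,^Boiling point"}
    attributes = docopt_arguments["--attributes"].split(',')
    # -> ["^Melting point", "^Boiling point"], stored on the spider as
    #    selected_attributes and matched by AttributeSelectionPipeline via re.match.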
@@ -1,6 +1,7 @@
 import inspect
 import os
 import re
+
 from FourmiCrawler.sources.source import Source
 
 
@@ -8,6 +9,10 @@ class SourceLoader:
     sources = []
 
     def __init__(self, rel_dir="FourmiCrawler/sources"):
+        """
+        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        :param rel_dir: A relative path to a directory.
+        """
         path = os.path.dirname(os.path.abspath(__file__))
         path += "/" + rel_dir
         known_parser = set()
@@ -21,18 +26,30 @@ class SourceLoader:
                     known_parser.add(cls)
 
     def include(self, source_names):
+        """
+        This function excludes all sources that don't match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         new = set()
         for name in source_names:
             new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = list(new)
 
     def exclude(self, source_names):
+        """
+        This function excludes all sources that match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         exclude = []
         for name in source_names:
             exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = [src for src in self.sources if src not in exclude]
 
     def __str__(self):
+        """
+        This function returns a string with all sources currently available in the SourceLoader.
+        :return: a string with all available sources.
+        """
         string = ""
         for src in self.sources:
             string += "Source: " + src.__class__.__name__
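include() and exclude() filter the loaded sources by matching the given regular expressions against each source's class name; this is what the --include/--exclude options in fourmi.py feed into. A brief usage sketch (assuming the sources shown in this commit are on the path):

    from sourceloader import SourceLoader

    loader = SourceLoader()           # scans FourmiCrawler/sources for Source subclasses
    print(str(loader))                # lists the available sources by class name
    loader.exclude(['ChemSpider'])    # drop sources whose class name matches
    loader.include(['NIST'])          # keep only sources whose class name matches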