From 971552110a82dadd54b2ec006c3ee165fc17ca35 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Tue, 29 Apr 2014 16:30:09 +0200
Subject: [PATCH 01/41] Added test for empty values on properties in ACD/Labs
 tab

---
 FourmiCrawler/sources/ChemSpider.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index a62f6dd..332c036 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -47,7 +47,6 @@ class ChemSpider(Source):
         properties = []
 
         # Predicted - ACD/Labs tab
-        # [TODO] - test if tab contains data, some chemicals do not have data here
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
@@ -58,6 +57,12 @@ class ChemSpider(Source):
             prop_value = prop_value.extract().encode('utf-8')
             prop_conditions = ''
 
+            # Test for properties without values, with one hardcoded exception
+            if (not re.match(r'^\d', prop_value) or
+                    (prop_name == 'Polarizability' and
+                    prop_value == '10-24cm3')):
+                continue
+
             # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
@@ -215,4 +220,4 @@ class ChemSpider(Source):
             return None
         searchurl = self.website[:-1] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
-        return Request(url=searchurl, callback=self.parse_searchrequest)
\ No newline at end of file
+        return Request(url=searchurl, callback=self.parse_searchrequest)

From 73753a6294ea7388b58a0ddc167e2df93b4256f5 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Thu, 1 May 2014 12:04:44 +0200
Subject: [PATCH 02/41] chemspider source now handles vague search requests

---
 FourmiCrawler/sources/ChemSpider.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 332c036..d7d125b 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -205,8 +205,14 @@ class ChemSpider(Source):
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
-        csid = sel.xpath('.//cs:int/text()').extract()[0]
-        # [TODO] - handle multiple csids in case of vague search term
+        csids = sel.xpath('.//cs:int/text()').extract()
+        if len(csids) == 0:
+            log.msg('ChemSpider found nothing', level=log.ERROR)
+            return
+        elif len(csids) > 1:
+            log.msg('ChemSpider found multiple substances, taking first '
+                    'element', level=log.DEBUG)
+        csid = csids[0]
         structure_url = self.website[:-1] + self.structure % csid
         extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)

From ca0a22ae7b7debebb283dece518e29ecbfca15f9 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Thu, 1 May 2014 12:14:52 +0200
Subject: [PATCH 03/41] Added test for empty values on properties in ChemSpider
 ExtendedCompoundInfo API

---
 FourmiCrawler/sources/ChemSpider.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index d7d125b..2fcd07c 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -197,7 +197,8 @@ class ChemSpider(Source):
                 'reliability': 'Unknown',
                 'conditions': ''
             })
-            properties.append(result)
+            if result['value']:
+                properties.append(result)
         return properties
 
     def parse_searchrequest(self, response):

From 95e9675605deed13f3b2f53ffb47525fadcdbf17 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 1 May 2014 14:57:09 +0200
Subject: [PATCH 04/41] created stub for NIST parser

---
 FourmiCrawler/sources/NIST.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 FourmiCrawler/sources/NIST.py

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
new file mode 100644
index 0000000..57befff
--- /dev/null
+++ b/FourmiCrawler/sources/NIST.py
@@ -0,0 +1,16 @@
+from scrapy import log
+# from scrapy.http import Request
+
+
+class NIST(Source):
+    website = "http://webbook.nist.gov/*"  
+
+    def __init__(self):
+        Source.__init__(self)
+
+    def parse(self, reponse):
+        pass
+
+    def new_compound_request(self, compound):
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        pass

From f8d390d3e604bedc2a428cc24824830a8bc31d5a Mon Sep 17 00:00:00 2001
From: Bas Vb <bas.berkel@student.ru.nl>
Date: Thu, 1 May 2014 15:04:11 +0200
Subject: [PATCH 05/41] Starting with fixing the wikiparser

---
 FourmiCrawler/sources/WikipediaParser.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index c251fca..8d8cded 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -37,7 +37,7 @@ class WikipediaParser(Source):
         items = []
 
         #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
-        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').\
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
@@ -51,6 +51,23 @@ class WikipediaParser(Source):
             })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr').\
+            xpath('normalize-space(string())')
+        log.msg('%s' %tr_list2,level=log.DEBUG)
+        #prop_names = tr_list2[::2]
+        #prop_values = tr_list2[1::2]
+        #for i, prop_name in enumerate(prop_names):
+        #    item = Result({
+        #        'attribute': prop_name.extract().encode('utf-8'),
+        #        'value': prop_values[i].extract().encode('utf-8'),
+        #        'source': "Wikipedia",
+        #        'reliability': "",
+        #        'conditions': ""
+        #    })
+        #    items.append(item)
+        #    log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)
 

From e1e507f745b1dbc16a15b460594f69a3842ff2fe Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 1 May 2014 15:30:28 +0200
Subject: [PATCH 06/41] added several required imports

---
 FourmiCrawler/sources/NIST.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 57befff..cbcbeb4 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -1,5 +1,8 @@
+from source import Source
 from scrapy import log
-# from scrapy.http import Request
+from scrapy.http import Request
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
 
 
 class NIST(Source):

From 03e652d454e34dbc30d9f2fa3c6f32ef57845e01 Mon Sep 17 00:00:00 2001
From: Bas Vb <bas.berkel@student.ru.nl>
Date: Thu, 1 May 2014 16:05:37 +0200
Subject: [PATCH 07/41] Wikipediaparser now works on chemboxes as well

---
 FourmiCrawler/sources/WikipediaParser.py | 31 ++++++++++++------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 8d8cded..2964567 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -46,27 +46,26 @@ class WikipediaParser(Source):
                 'attribute': prop_name.extract().encode('utf-8'),
                 'value': prop_values[i].extract().encode('utf-8'),
                 'source': "Wikipedia",
-                'reliability': "",
+                'reliability': "Unknown",
                 'conditions': ""
             })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr').\
-            xpath('normalize-space(string())')
-        log.msg('%s' %tr_list2,level=log.DEBUG)
-        #prop_names = tr_list2[::2]
-        #prop_values = tr_list2[1::2]
-        #for i, prop_name in enumerate(prop_names):
-        #    item = Result({
-        #        'attribute': prop_name.extract().encode('utf-8'),
-        #        'value': prop_values[i].extract().encode('utf-8'),
-        #        'source': "Wikipedia",
-        #        'reliability': "",
-        #        'conditions': ""
-        #    })
-        #    items.append(item)
-        #    log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')#.xpath('normalize-space(string())')
+        log.msg('dit: %s' %tr_list2,level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' %tablerow.xpath('./th').xpath('normalize-space(string())'),level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath('normalize-space(string())'):
+                item = Result({
+                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'source': "Wikipedia",
+                    'reliability': "Unknown",
+                    'conditions': ""
+                })
+                items.append(item)
+                log.msg('Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)

From 0cec4bd2d8960bb9ffa1e8b3f99f3230bb694580 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Sun, 4 May 2014 20:48:51 +0200
Subject: [PATCH 08/41] new_compound_request now returns a Request with a search
 URL

---
 FourmiCrawler/sources/NIST.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index cbcbeb4..2c4337c 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -8,6 +8,8 @@ from FourmiCrawler.items import Result
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
 
+    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTG=on&cTC=on&cTP=on'
+
     def __init__(self):
         Source.__init__(self)
 
@@ -15,5 +17,5 @@ class NIST(Source):
         pass
 
     def new_compound_request(self, compound):
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
-        pass
+        return Request(url=self.website[:-1] + self.search % compound,
+                       callback=self.parse)

From 930eb6cad588d49a46b2dea51d0cbe72565c4763 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Sun, 4 May 2014 21:20:46 +0200
Subject: [PATCH 09/41] NIST now scrapes the symbol table for later use

---
 FourmiCrawler/sources/NIST.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 2c4337c..44a8037 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -13,8 +13,17 @@ class NIST(Source):
     def __init__(self):
         Source.__init__(self)
 
-    def parse(self, reponse):
-        pass
+    def parse(self, response):
+        sel = Selector(response)
+
+        symbol_table = {}
+        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
+        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
+            symbol = ''.join(symbol_td.xpath('node()').extract())
+            name = name_td.xpath('text()').extract()[0]
+            symbol_table[symbol] = name
+            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
+                    level=log.DEBUG)
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,

From 9c80f291b6230f2a0d3958773df33f9f6e1e6f82 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 13:27:22 +0200
Subject: [PATCH 10/41] search NIST exclusively for phase change data

---
 FourmiCrawler/sources/NIST.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 44a8037..cd049a0 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -8,7 +8,7 @@ from FourmiCrawler.items import Result
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
 
-    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTG=on&cTC=on&cTP=on'
+    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
     def __init__(self):
         Source.__init__(self)

From 95e24f9c44778a18d764e6bfc09d99d16b2fdb87 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 17:09:42 +0200
Subject: [PATCH 11/41] added code to recognize various table formats

---
 FourmiCrawler/sources/NIST.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index cd049a0..37f8d04 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -25,6 +25,28 @@ class NIST(Source):
             log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                     level=log.DEBUG)
 
+        for tables in sel.xpath('//table[@class="data"]'):
+            if tables.xpath('@summary').extract()[0] == 'One dimensional data':
+                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
+            elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
+                log.msg('NIST table; Enthalpy/entropy of phase transition',
+                        level=log.DEBUG)
+            elif tables.xpath('tr[1]/td'):
+                log.msg('NIST table: Horizontal table', level=log.DEBUG)
+            elif (tables.xpath('@summary').extract()[0] ==
+                    'Antoine Equation Parameters'):
+                log.msg('NIST table: Antoine Equation Parameters',
+                        level=log.DEBUG)
+            elif len(tables.xpath('tr[1]/th')) == 5:
+                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
+                # Symbol (unit) Temperature (K) Method Reference Comment
+            elif len(tables.xpath('tr[1]/th')) == 4:
+                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
+                # Symbol (unit) Temperature (K) Reference Comment
+            else:
+                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
+                continue #Assume unsupported
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 85595ecf350e173692420f91eb439571444fed86 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 18:12:08 +0200
Subject: [PATCH 12/41] created function to start scraping the aggregate data
 table

---
 FourmiCrawler/sources/NIST.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 37f8d04..6e884ef 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -16,6 +16,8 @@ class NIST(Source):
     def parse(self, response):
         sel = Selector(response)
 
+        requests = []
+
         symbol_table = {}
         tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
         for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
@@ -28,6 +30,8 @@ class NIST(Source):
         for tables in sel.xpath('//table[@class="data"]'):
             if tables.xpath('@summary').extract()[0] == 'One dimensional data':
                 log.msg('NIST table: Aggregrate data', level=log.DEBUG)
+                requests.extend(
+                    self.parse_aggregate_data(tables, symbol_table))
             elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
@@ -46,6 +50,25 @@ class NIST(Source):
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported
+        return requests
+
+    @staticmethod
+    def parse_aggregate_data(table, symbol_table):
+        results = []
+        for tr in table.xpath('tr[td]'):
+            data = []
+            for td in tr.xpath('td'):
+                data.append(''.join(td.xpath('node()').extract()))
+            result = Result({
+                'attribute': symbol_table[data[0]],
+                'value': data[1] + ' ' + data[2],
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
+            log.msg('NIST: |%s|' % data, level=log.DEBUG)
+            results.append(result)
+        return results
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,

From 10dd74e02617401f80e32c29dd95513fe93248a7 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 21:58:52 +0200
Subject: [PATCH 13/41] added function to scrape transition tables

---
 FourmiCrawler/sources/NIST.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 6e884ef..0191fee 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -3,7 +3,7 @@ from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
 from FourmiCrawler.items import Result
-
+import re
 
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
@@ -35,6 +35,8 @@ class NIST(Source):
             elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
+                requests.extend(
+                    self.parse_transition_data(tables, symbol_table))
             elif tables.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
             elif (tables.xpath('@summary').extract()[0] ==
@@ -70,6 +72,28 @@ class NIST(Source):
             results.append(result)
         return results
 
+    @staticmethod
+    def parse_transition_data(table, symbol_table):
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            })
+            log.msg('NIST: |%s|' % result, level=log.DEBUG)
+            results.append(result)
+
+
+        return results
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 7abb491d3fdcd92661b9c7dc8a7ebfbd686e4cbb Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 22:08:43 +0200
Subject: [PATCH 14/41] added function for most generic tables

---
 FourmiCrawler/sources/NIST.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 0191fee..5757546 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -46,9 +46,13 @@ class NIST(Source):
             elif len(tables.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
+                requests.extend(
+                    self.parse_generic_data(tables))
             elif len(tables.xpath('tr[1]/th')) == 4:
                 log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Reference Comment
+                requests.extend(
+                    self.parse_generic_data(tables))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported
@@ -94,6 +98,26 @@ class NIST(Source):
 
         return results
 
+    @staticmethod
+    def parse_generic_data(table):
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[1]
+            })
+            log.msg('NIST: |%s|' % result, level=log.DEBUG)
+            results.append(result)
+        return results
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 151f1988a16d4fb58134320106b7712105083681 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 8 May 2014 14:54:30 +0200
Subject: [PATCH 15/41] added function to scrape table for Antoine equation
 parameters

---
 FourmiCrawler/sources/NIST.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 5757546..aba0ec2 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -43,6 +43,8 @@ class NIST(Source):
                     'Antoine Equation Parameters'):
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
+                requests.extend(
+                    self.parse_antoine_data(tables))
             elif len(tables.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
@@ -118,6 +120,25 @@ class NIST(Source):
             results.append(result)
         return results
 
+    @staticmethod
+    def parse_antoine_data(table):
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[0]
+            })
+            results.append(result)
+
+        return results
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From f6fa5e8adf283810ccb12389642015a37df44431 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 8 May 2014 15:22:48 +0200
Subject: [PATCH 16/41] fixed scraping of unit for two kinds of tables

---
 FourmiCrawler/sources/NIST.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index aba0ec2..0ce3a28 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -83,7 +83,11 @@ class NIST(Source):
         results = []
 
         name = table.xpath('@summary').extract()[0]
-        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -105,7 +109,11 @@ class NIST(Source):
         results = []
 
         name = table.xpath('@summary').extract()[0]
-        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()

From 74dddace883e84b8bad787751ccf961f7125eca4 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 8 May 2014 15:42:53 +0200
Subject: [PATCH 17/41] removed logging of Result objects in debug messages
 because pointless

---
 FourmiCrawler/sources/NIST.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 0ce3a28..635a61b 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -98,7 +98,6 @@ class NIST(Source):
                 'reliability': 'Unknown',
                 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
             })
-            log.msg('NIST: |%s|' % result, level=log.DEBUG)
             results.append(result)
 
 
@@ -124,7 +123,6 @@ class NIST(Source):
                 'reliability': 'Unknown',
                 'conditions': '%s K' % tds[1]
             })
-            log.msg('NIST: |%s|' % result, level=log.DEBUG)
             results.append(result)
         return results
 

From 5e067fd57297769b63dc102352f858024233fec3 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Fri, 9 May 2014 12:36:54 +0200
Subject: [PATCH 18/41] altered scraping of aggregate data to test for and
 request url to individual data points

---
 FourmiCrawler/sources/NIST.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 635a61b..1222c63 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -60,10 +60,16 @@ class NIST(Source):
                 continue #Assume unsupported
         return requests
 
-    @staticmethod
-    def parse_aggregate_data(table, symbol_table):
+    def parse_aggregate_data(self, table, symbol_table):
         results = []
         for tr in table.xpath('tr[td]'):
+            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
+                                '/a/@href').extract()
+            if extra_data_url:
+                request = Request(url=self.website[:-1] + extra_data_url[0],
+                    callback=self.parse_individual_datapoints)
+                results.append(request)
+                continue
             data = []
             for td in tr.xpath('td'):
                 data.append(''.join(td.xpath('node()').extract()))
@@ -145,6 +151,9 @@ class NIST(Source):
 
         return results
 
+    def parse_individual_datapoints(self, response):
+        pass
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 775a920b9bc72f2d6f7e08624a0203af4f0b0a22 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Fri, 9 May 2014 13:00:22 +0200
Subject: [PATCH 19/41] NIST scraper now handles urls with individual data
 points

---
 FourmiCrawler/sources/NIST.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 1222c63..6ae6862 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -152,7 +152,30 @@ class NIST(Source):
         return results
 
     def parse_individual_datapoints(self, response):
-        pass
+        sel = Selector(response)
+        table = sel.xpath('//table[@class="data"]')[0]
+
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': '%s %s' % (tds[0], unit),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
+            results.append(result)
+
+        return results
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,

From 7e984b60d8a8d42d832a160554e356ab833a5419 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Fri, 9 May 2014 14:24:08 +0200
Subject: [PATCH 20/41] added uncertainty to results from scraping individual
 data points urls

---
 FourmiCrawler/sources/NIST.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 6ae6862..2d3c672 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -166,9 +166,14 @@ class NIST(Source):
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
+            uncertainty = ''
+            m = re.search('Uncertainty assigned by TRC =  (.*?) ', tds[-1])
+            if m:
+                uncertainty = '+- %s ' % m.group(1)
+                # [TODO]: get the plusminus sign working in here
             result = Result({
                 'attribute': name,
-                'value': '%s %s' % (tds[0], unit),
+                'value': '%s %s%s' % (tds[0], uncertainty, unit),
                 'source': 'NIST',
                 'reliability': 'Unknown',
                 'conditions': ''

From b54568bab0281ab80ef9ce2e4ec3a94138322447 Mon Sep 17 00:00:00 2001
From: Bas Vb <bas.berkel@student.ru.nl>
Date: Tue, 13 May 2014 16:18:32 +0200
Subject: [PATCH 21/41] Small fixes

---
 FourmiCrawler/sources/WikipediaParser.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 2964567..cb7d0b9 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -36,8 +36,8 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []
 
-        #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
-        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').\
+        #be sure to get chembox (wikipedia template)
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
@@ -52,11 +52,13 @@ class WikipediaParser(Source):
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')#.xpath('normalize-space(string())')
-        log.msg('dit: %s' %tr_list2,level=log.DEBUG)
+        #scrape the  drugbox (wikipedia template)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
         for tablerow in tr_list2:
-            log.msg('item: %s' %tablerow.xpath('./th').xpath('normalize-space(string())'),level=log.DEBUG)
-            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath('normalize-space(string())'):
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
                 item = Result({
                     'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                     'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
@@ -65,7 +67,9 @@ class WikipediaParser(Source):
                     'conditions': ""
                 })
                 items.append(item)
-                log.msg('Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)

From 0a2bfeb14990f5b217b122ba8bc256574a0a11bd Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 13 May 2014 21:43:16 +0200
Subject: [PATCH 22/41] I'm more experienced with Markdown

---
 README.rst => README.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename README.rst => README.md (100%)

diff --git a/README.rst b/README.md
similarity index 100%
rename from README.rst
rename to README.md

From b6ae4977d90f4f427786048cb11de26ffbe49d85 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 13 May 2014 23:26:31 +0200
Subject: [PATCH 23/41] Complete rewrite of the README

---
 README.md | 96 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 87 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c251791..4732c56 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,94 @@
-We are the team Descartes 2.
-----------------------------
+# Fourmi
 
-Our team members are:
+Fourmi is a web scraper for chemical substances. The program is designed to be
+used as a search engine to search multiple chemical databases for a specific
+substance. The program will produce all available attributes of the substance
+and conditions associated with the attributes. Fourmi also attempts to estimate
+the reliability of each data point to assist the user in deciding which data
+should be used.
 
-+ Rob ten Berge
+The Fourmi project is an open source project licensed under the MIT license. Feel
+free to contribute!
 
-+ Bas van Berkel
+Fourmi is based on the [Scrapy framework](http://scrapy.org/), an open source
+web scraping framework for python. Most of the functionality of this project can
+be traced to this framework. Should the documentation for this application fall
+short, we suggest you take a close look at the
+[Scrapy architecture](http://doc.scrapy.org/en/latest/topics/architecture.html)
+and the [Scrapy documentation](http://doc.scrapy.org/en/latest/index.html).
 
-+ Nout van Deijck
+### Installing 
 
-+ Jip J. Dekker
+If you're installing Fourmi, please take a look at our [installation guide](...)
+on our wiki. When you've installed the application, make sure to check our
+[usage guide](...).
 
-+ Michail Kuznetcov
+### Using the Source
 
-+ Harmen Prins
\ No newline at end of file
+To use the Fourmi source code multiple dependencies are required. Take a look at
+the [wiki page](...) on using the application source code for a step by step
+installation guide.
+
+When developing for the Fourmi project keep in mind that code readability is a
+must. To maintain the readability, code should conform to the
+[PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for Python
+code. More information about the different structures and principles of the
+Fourmi application can be found on our [wiki](...).
+
+### To Do
+
+The Fourmi project has the following goals for the nearby future:
+
+** Main goals: **
+
+- Improve our documentation and guides. (Assignee: Dekker)
+
+- Build an graphical user interface(GUI) as alternative for the command line
+interface(CLI). (Assignee: Harmen)
+
+- Compiling the source into an windows executable. (Assignee: Bas)
+
+- Create an configuration file to hold logins and API keys.
+
+- Determine reliability of our data point.
+
+- Create an module to gather data from NIST. (Assignee: Rob)
+
+- Create an module to gather data from PubChem. (Assignee: Rob)
+
+** Side goals: **
+
+- Clean and unify data.
+
+- Extensive reliability analysis using statistical tests.
+
+- Test data with Descartes 1.
+
+### Project Origin
+
+The Fourmi project was started in February of 2014 as part of a software
+engineering course at the Radboud University for students studying Computer
+Science, Information Science or Artificial Intelligence. Students participate in
+a real software development project as part of the
+[Giphouse](http://www.giphouse.nl/).
+
+This particular project was started on behalf of Ivo B. Rietveld. As a chemist
+he was in need of an application to automatically search information on chemical
+substances and create a phase diagram. The so-called "Descartes" project was
+split into two teams each creating a different application that has part of the
+functionality. We are the team Descartes 2 and as we were responsible for
+creating a web crawler, we've named our application Fourmi (English: Ants).
+
+The following people were part of the original team:
+
+- [Jip J. Dekker](http://jip.dekker.li)
+
+- Rob ten Berge
+
+- Harmen Prins
+
+- Bas van Berkel
+
+- Nout van Deijck
+
+- Michail Kuznetcov
\ No newline at end of file

From c380b740461d4b1d07482511f3dc8a2432df43cc Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 13 May 2014 23:28:56 +0200
Subject: [PATCH 24/41] Making things bold, removing breaklines

---
 README.md | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 4732c56..af012fd 100644
--- a/README.md
+++ b/README.md
@@ -39,29 +39,21 @@ Fourmi application can be found on our [wiki](...).
 
 The Fourmi project has the following goals for the nearby future:
 
-** Main goals: **
+__Main goals:__
 
 - Improve our documentation and guides. (Assignee: Dekker)
-
 - Build an graphical user interface(GUI) as alternative for the command line
 interface(CLI). (Assignee: Harmen)
-
 - Compiling the source into an windows executable. (Assignee: Bas)
-
 - Create an configuration file to hold logins and API keys.
-
 - Determine reliability of our data point.
-
 - Create an module to gather data from NIST. (Assignee: Rob)
-
 - Create an module to gather data from PubChem. (Assignee: Rob)
 
-** Side goals: **
+__Side goals:__
 
 - Clean and unify data.
-
 - Extensive reliability analysis using statistical tests.
-
 - Test data with Descartes 1.
 
 ### Project Origin
@@ -82,13 +74,8 @@ creating a web crawler, we've named our application Fourmi (Englis: Ants).
 The following people were part of the original team:
 
 - [Jip J. Dekker](http://jip.dekker.li)
-
 - Rob ten Berge
-
 - Harmen Prins
-
 - Bas van Berkel
-
 - Nout van Deijck
-
 - Michail Kuznetcov
\ No newline at end of file

From 284d24c7830d96bf15f7386b9e4f0e13c9dbb0e6 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 13 May 2014 23:35:12 +0200
Subject: [PATCH 25/41] Bumped the version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index a9c1d68..efa4e54 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
 
 
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.6')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
     loader = SourceLoader()
 
     if arguments["--include"]:

From ee92e25ab4b25dbed46fd823b8de1a54ca0ea0a4 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 13 May 2014 23:43:37 +0200
Subject: [PATCH 26/41] Fixed the right assignees

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index af012fd..e9150a6 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ interface(CLI). (Assignee: Harmen)
 - Create an configuration file to hold logins and API keys.
 - Determine reliability of our data point.
 - Create an module to gather data from NIST. (Assignee: Rob)
-- Create an module to gather data from PubChem. (Assignee: Rob)
+- Create an module to gather data from PubChem. (Assignee: Nout)
 
 __Side goals:__
 

From 50c79e3b1f7357b9100b3aefa097a036696ace2f Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 14 May 2014 13:44:43 +0200
Subject: [PATCH 27/41] conditions in name (split by ' at ') are now moved to
 condition field for individual value page and aggregate data table

---
 FourmiCrawler/sources/NIST.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 2d3c672..5a9b544 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -73,12 +73,21 @@ class NIST(Source):
             data = []
             for td in tr.xpath('td'):
                 data.append(''.join(td.xpath('node()').extract()))
+
+            name = symbol_table[data[0]]
+            condition = ''
+
+            m = re.match(r'(.*) at (.*)', name)
+            if m:
+                name = m.group(1)
+                condition = m.group(2)
+
             result = Result({
-                'attribute': symbol_table[data[0]],
+                'attribute': name,
                 'value': data[1] + ' ' + data[2],
                 'source': 'NIST',
                 'reliability': 'Unknown',
-                'conditions': ''
+                'conditions': condition
             })
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
@@ -158,6 +167,12 @@ class NIST(Source):
         results = []
 
         name = table.xpath('@summary').extract()[0]
+        condition = ''
+        m = re.match(r'(.*) at (.*)', name)
+        if m:
+            name = m.group(1)
+            condition = m.group(2)
+
         tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
         m = re.search(r'\((.*)\)', tr_unit)
         unit = '!'
@@ -176,7 +191,7 @@ class NIST(Source):
                 'value': '%s %s%s' % (tds[0], uncertainty, unit),
                 'source': 'NIST',
                 'reliability': 'Unknown',
-                'conditions': ''
+                'conditions': condition
             })
             results.append(result)
 

From 98f58ea4e26f6ef8dfbd44467b3d47ebee64d283 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 15 May 2014 14:29:28 +0200
Subject: [PATCH 28/41] added scraping for generic info except for synonyms

---
 FourmiCrawler/sources/NIST.py | 37 +++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 5a9b544..ddb7a09 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -18,6 +18,8 @@ class NIST(Source):
 
         requests = []
 
+        requests.extend(self.parse_generic_info(sel))
+
         symbol_table = {}
         tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
         for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
@@ -60,6 +62,41 @@ class NIST(Source):
                 continue #Assume unsupported
         return requests
 
+    def parse_generic_info(self, sel):
+        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
+        li = ul.xpath('li')
+
+        data = {}
+
+        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
+        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
+
+        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
+        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
+
+        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
+        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
+
+        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
+                            '/tt/text()')
+        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
+
+        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
+        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
+
+        requests = []
+        for key, value in data.iteritems():
+            result = Result({
+                'attribute': key,
+                'value': value,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
+            requests.append(result)
+
+        return requests
+
     def parse_aggregate_data(self, table, symbol_table):
         results = []
         for tr in table.xpath('tr[td]'):

From 56ee6b1ad347c475ef24f077377c35577b4fbfc5 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 14:09:10 +0200
Subject: [PATCH 29/41] added ignore list

---
 FourmiCrawler/sources/NIST.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index ddb7a09..17552e5 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -5,11 +5,15 @@ from scrapy.selector import Selector
 from FourmiCrawler.items import Result
 import re
 
+# [TODO]: values can be '128.', perhaps remove the dot in that case?
+
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
+    ignore_list = set()
+
     def __init__(self):
         Source.__init__(self)
 
@@ -235,5 +239,7 @@ class NIST(Source):
         return results
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + self.search % compound,
-                       callback=self.parse)
+        if compound not in self.ignore_list:
+            self.ignore_list.update(compound)
+            return Request(url=self.website[:-1] + self.search % compound,
+                           callback=self.parse)

From afc1106838120503bd4cde2af80781e3e2738c9d Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 14:11:03 +0200
Subject: [PATCH 30/41] NIST now logs an error if chemical name is not found

---
 FourmiCrawler/sources/NIST.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 17552e5..d5eaa76 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -20,6 +20,11 @@ class NIST(Source):
     def parse(self, response):
         sel = Selector(response)
 
+        title = sel.xpath('head/title/text()').extract()[0]
+        if title == 'Name Not Found':
+            log.msg('NIST: Chemical not found!', level=log.ERROR)
+            return
+
         requests = []
 
         requests.extend(self.parse_generic_info(sel))

From b46c7a309d8132efbed111ba13d4d52fa122d70f Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 14:21:11 +0200
Subject: [PATCH 31/41] if synonym name matched in search instead of primary
 name, emit primary name as synonym

---
 FourmiCrawler/sources/NIST.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index d5eaa76..a969384 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -24,6 +24,10 @@ class NIST(Source):
         if title == 'Name Not Found':
             log.msg('NIST: Chemical not found!', level=log.ERROR)
             return
+        if title not in self.ignore_list:
+            self.ignore_list.update(title)
+            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
+            self._spider.get_synonym_requests(title)
 
         requests = []
 

From 472aae86be443c8372c356b0509e13cabd3b1c9d Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 19:32:20 +0200
Subject: [PATCH 32/41] synonyms are now scraped

---
 FourmiCrawler/sources/NIST.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index a969384..4bb8e30 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -79,6 +79,12 @@ class NIST(Source):
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
         li = ul.xpath('li')
 
+        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
+        for synonym in raw_synonyms[0].strip().split(';\n'):
+            log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
+            self.ignore_list.update(synonym)
+            self._spider.get_synonym_requests(synonym)
+
         data = {}
 
         raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()

From 81719a38fbb72a4344cd25c49c7361e8907779e6 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Tue, 20 May 2014 19:32:06 +0200
Subject: [PATCH 33/41] Added comments for the class and functions

---
 FourmiCrawler/sources/NIST.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 4bb8e30..a2fb425 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -6,8 +6,15 @@ from FourmiCrawler.items import Result
 import re
 
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
+# [TODO]: properties have references and comments which do not exist in the
+#         Result item, but should be included eventually.
 
 class NIST(Source):
+    """NIST Scraper plugin
+
+    This plugin manages searching for a chemical on the NIST website
+    and parsing the resulting page if the chemical exists on NIST.
+    """
     website = "http://webbook.nist.gov/*"  
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@@ -76,6 +83,9 @@ class NIST(Source):
         return requests
 
     def parse_generic_info(self, sel):
+        """Parses: synonyms, chemical formula, molecular weight, InChI,
+        InChiKey, CAS number
+        """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
         li = ul.xpath('li')
 
@@ -117,6 +127,9 @@ class NIST(Source):
         return requests
 
     def parse_aggregate_data(self, table, symbol_table):
+        """Parses the table(s) which contain possible links to individual
+        data points
+        """
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
@@ -151,6 +164,7 @@ class NIST(Source):
 
     @staticmethod
     def parse_transition_data(table, symbol_table):
+        """Parses the table containing properties regarding phase changes"""
         results = []
 
         name = table.xpath('@summary').extract()[0]
@@ -176,6 +190,11 @@ class NIST(Source):
 
     @staticmethod
     def parse_generic_data(table):
+        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+        form:
+        Symbol (unit)|Temperature (K)|Method|Reference|Comment
+        Symbol (unit)|Temperature (K)|Reference|Comment
+        """
         results = []
 
         name = table.xpath('@summary').extract()[0]
@@ -199,6 +218,7 @@ class NIST(Source):
 
     @staticmethod
     def parse_antoine_data(table):
+        """Parse table containing parameters for the Antione equation"""
         results = []
 
         name = table.xpath('@summary').extract()[0]
@@ -217,6 +237,7 @@ class NIST(Source):
         return results
 
     def parse_individual_datapoints(self, response):
+        """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
 

From 95565042cae6ba0eec9a58ff4308fa6e18a28765 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:22:03 +0200
Subject: [PATCH 34/41] removed unused variable symbol_table from
 parse_transition_table

---
 FourmiCrawler/sources/NIST.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index a2fb425..7770efc 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -57,8 +57,7 @@ class NIST(Source):
             elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
-                requests.extend(
-                    self.parse_transition_data(tables, symbol_table))
+                requests.extend(self.parse_transition_data(tables))
             elif tables.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
             elif (tables.xpath('@summary').extract()[0] ==
@@ -163,7 +162,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_transition_data(table, symbol_table):
+    def parse_transition_data(table):
         """Parses the table containing properties regarding phase changes"""
         results = []
 

From 429ffd74221ff2262dde1ab1113bbbfaffe23001 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:28:54 +0200
Subject: [PATCH 35/41] renamed tables to table in parse()

---
 FourmiCrawler/sources/NIST.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 7770efc..d04b0d0 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -49,33 +49,30 @@ class NIST(Source):
             log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                     level=log.DEBUG)
 
-        for tables in sel.xpath('//table[@class="data"]'):
-            if tables.xpath('@summary').extract()[0] == 'One dimensional data':
+        for table in sel.xpath('//table[@class="data"]'):
+            if table.xpath('@summary').extract()[0] == 'One dimensional data':
                 log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                 requests.extend(
-                    self.parse_aggregate_data(tables, symbol_table))
-            elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
+                    self.parse_aggregate_data(table, symbol_table))
+            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
-                requests.extend(self.parse_transition_data(tables))
-            elif tables.xpath('tr[1]/td'):
+                requests.extend(self.parse_transition_data(table))
+            elif table.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
-            elif (tables.xpath('@summary').extract()[0] ==
+            elif (table.xpath('@summary').extract()[0] ==
                     'Antoine Equation Parameters'):
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
-                requests.extend(
-                    self.parse_antoine_data(tables))
-            elif len(tables.xpath('tr[1]/th')) == 5:
+                requests.extend(self.parse_antoine_data(table))
+            elif len(table.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
-                requests.extend(
-                    self.parse_generic_data(tables))
-            elif len(tables.xpath('tr[1]/th')) == 4:
+                requests.extend(self.parse_generic_data(table))
+            elif len(table.xpath('tr[1]/th')) == 4:
                 log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Reference Comment
-                requests.extend(
-                    self.parse_generic_data(tables))
+                requests.extend(self.parse_generic_data(table))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported

From c0af24644b922eaa3223b7889ae094cd83290e1c Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:31:19 +0200
Subject: [PATCH 36/41] added summary variable in parse()

---
 FourmiCrawler/sources/NIST.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index d04b0d0..cadf6dc 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -50,7 +50,8 @@ class NIST(Source):
                     level=log.DEBUG)
 
         for table in sel.xpath('//table[@class="data"]'):
-            if table.xpath('@summary').extract()[0] == 'One dimensional data':
+            summary = table.xpath('@summary').extract()[0]
+            if summary == 'One dimensional data':
                 log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                 requests.extend(
                     self.parse_aggregate_data(table, symbol_table))
@@ -60,8 +61,7 @@ class NIST(Source):
                 requests.extend(self.parse_transition_data(table))
             elif table.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
-            elif (table.xpath('@summary').extract()[0] ==
-                    'Antoine Equation Parameters'):
+            elif summary == 'Antoine Equation Parameters'):
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
                 requests.extend(self.parse_antoine_data(table))

From 6cd8edaf222f2bdfdc97ab59de914cc65d05369e Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:36:42 +0200
Subject: [PATCH 37/41] included summary variable in call to transition_table,
 antoine table and generic table

---
 FourmiCrawler/sources/NIST.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index cadf6dc..76caca3 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -58,21 +58,21 @@ class NIST(Source):
             elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
-                requests.extend(self.parse_transition_data(table))
+                requests.extend(self.parse_transition_data(table, summary))
             elif table.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
-            elif summary == 'Antoine Equation Parameters'):
+            elif summary == 'Antoine Equation Parameters':
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
-                requests.extend(self.parse_antoine_data(table))
+                requests.extend(self.parse_antoine_data(table, summary))
             elif len(table.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
-                requests.extend(self.parse_generic_data(table))
+                requests.extend(self.parse_generic_data(table, summary))
             elif len(table.xpath('tr[1]/th')) == 4:
                 log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Reference Comment
-                requests.extend(self.parse_generic_data(table))
+                requests.extend(self.parse_generic_data(table, summary))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported
@@ -159,7 +159,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_transition_data(table):
+    def parse_transition_data(table, summary):
         """Parses the table containing properties regarding phase changes"""
         results = []
 
@@ -185,7 +185,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_generic_data(table):
+    def parse_generic_data(table, summary):
         """Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
@@ -213,7 +213,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_antoine_data(table):
+    def parse_antoine_data(table, summary):
         """Parse table containing parameters for the Antione equation"""
         results = []
 

From 6ce5ff23359786ae24274fcac3e11a7934a453e6 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:40:44 +0200
Subject: [PATCH 38/41] replaced name variable with summary variable

---
 FourmiCrawler/sources/NIST.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 76caca3..0b75b17 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -163,7 +163,6 @@ class NIST(Source):
         """Parses the table containing properties regarding phase changes"""
         results = []
 
-        name = table.xpath('@summary').extract()[0]
         tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
         m = re.search(r'\((.*)\)', tr_unit)
         unit = '!'
@@ -173,7 +172,7 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
             result = Result({
-                'attribute': name,
+                'attribute': summary,
                 'value': tds[0] + ' ' + unit,
                 'source': 'NIST',
                 'reliability': 'Unknown',
@@ -193,7 +192,6 @@ class NIST(Source):
         """
         results = []
 
-        name = table.xpath('@summary').extract()[0]
         tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
         m = re.search(r'\((.*)\)', tr_unit)
         unit = '!'
@@ -203,7 +201,7 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
             result = Result({
-                'attribute': name,
+                'attribute': summary,
                 'value': tds[0] + ' ' + unit,
                 'source': 'NIST',
                 'reliability': 'Unknown',
@@ -217,12 +215,10 @@ class NIST(Source):
         """Parse table containing parameters for the Antione equation"""
         results = []
 
-        name = table.xpath('@summary').extract()[0]
-
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
             result = Result({
-                'attribute': name,
+                'attribute': summary,
                 'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                 'source': 'NIST',
                 'reliability': 'Unknown',

From 98f91a1aa9b8b0fdce5a6c9de903695a46d0dcc9 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 22 May 2014 12:15:43 +0200
Subject: [PATCH 39/41] Added a pipeline to replace None values with empty
 strings

---
 FourmiCrawler/pipelines.py | 16 ++++++++++++++++
 FourmiCrawler/settings.py  |  5 +++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index e1dadbf..2c775f2 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -5,6 +5,22 @@
 import re
 from scrapy.exceptions import DropItem
 
+class RemoveNonePipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
+    def process_item(self, item, spider):
+        """
+        Processing the items so None values are replaced by empty strings
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: The item, with every None value replaced by an empty string
+        """
+        for key in item:
+            if item[key] is None:
+                item[key] = ""
+        return item
 
 class DuplicatePipeline(object):
 
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index d7ac212..be7c451 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -11,8 +11,9 @@ BOT_NAME = 'FourmiCrawler'
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
-    'FourmiCrawler.pipelines.DuplicatePipeline': 200,
+    "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
+    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
 }
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'

From f80a32a0dcb0e5216d3301969e6f360fc1d8cc31 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 22 May 2014 12:19:41 +0200
Subject: [PATCH 40/41] Pushed the version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index efa4e54..08e010b 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
 
 
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1')
     loader = SourceLoader()
 
     if arguments["--include"]:

From 7a4696aed4def4bb00f5a06a76f0a3a1249f86f8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Fri, 23 May 2014 13:05:01 +0200
Subject: [PATCH 41/41] Tested and pushed version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 08e010b..6d5b8e8 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
 
 
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
     loader = SourceLoader()
 
     if arguments["--include"]: