From 95e9675605deed13f3b2f53ffb47525fadcdbf17 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 1 May 2014 14:57:09 +0200
Subject: [PATCH 01/39] created stub for NIST parser

---
 FourmiCrawler/sources/NIST.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 FourmiCrawler/sources/NIST.py

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
new file mode 100644
index 0000000..57befff
--- /dev/null
+++ b/FourmiCrawler/sources/NIST.py
@@ -0,0 +1,16 @@
+from scrapy import log
+# from scrapy.http import Request
+
+
+class NIST(Source):
+    website = "http://webbook.nist.gov/*"  
+
+    def __init__(self):
+        Source.__init__(self)
+
+    def parse(self, reponse):
+        pass
+
+    def new_compound_request(self, compound):
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        pass

From e1e507f745b1dbc16a15b460594f69a3842ff2fe Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 1 May 2014 15:30:28 +0200
Subject: [PATCH 02/39] added several required imports

---
 FourmiCrawler/sources/NIST.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 57befff..cbcbeb4 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -1,5 +1,8 @@
+from source import Source
 from scrapy import log
-# from scrapy.http import Request
+from scrapy.http import Request
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
 
 
 class NIST(Source):

From 0cec4bd2d8960bb9ffa1e8b3f99f3230bb694580 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Sun, 4 May 2014 20:48:51 +0200
Subject: [PATCH 03/39] new_compound_request now returns a Request with a search
 URL

---
 FourmiCrawler/sources/NIST.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index cbcbeb4..2c4337c 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -8,6 +8,8 @@ from FourmiCrawler.items import Result
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
 
+    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTG=on&cTC=on&cTP=on'
+
     def __init__(self):
         Source.__init__(self)
 
@@ -15,5 +17,5 @@ class NIST(Source):
         pass
 
     def new_compound_request(self, compound):
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
-        pass
+        return Request(url=self.website[:-1] + self.search % compound,
+                       callback=self.parse)

From 930eb6cad588d49a46b2dea51d0cbe72565c4763 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Sun, 4 May 2014 21:20:46 +0200
Subject: [PATCH 04/39] NIST now scrapes the symbol table for later use

---
 FourmiCrawler/sources/NIST.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 2c4337c..44a8037 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -13,8 +13,17 @@ class NIST(Source):
     def __init__(self):
         Source.__init__(self)
 
-    def parse(self, reponse):
-        pass
+    def parse(self, response):
+        sel = Selector(response)
+
+        symbol_table = {}
+        tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
+        for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
+            symbol = ''.join(symbol_td.xpath('node()').extract())
+            name = name_td.xpath('text()').extract()[0]
+            symbol_table[symbol] = name
+            log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
+                    level=log.DEBUG)
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,

From 9c80f291b6230f2a0d3958773df33f9f6e1e6f82 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 13:27:22 +0200
Subject: [PATCH 05/39] search NIST exclusively for phase change data

---
 FourmiCrawler/sources/NIST.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 44a8037..cd049a0 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -8,7 +8,7 @@ from FourmiCrawler.items import Result
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
 
-    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTG=on&cTC=on&cTP=on'
+    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
     def __init__(self):
         Source.__init__(self)

From 95e24f9c44778a18d764e6bfc09d99d16b2fdb87 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 17:09:42 +0200
Subject: [PATCH 06/39] added code to recognize various table formats

---
 FourmiCrawler/sources/NIST.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index cd049a0..37f8d04 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -25,6 +25,28 @@ class NIST(Source):
             log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                     level=log.DEBUG)
 
+        for tables in sel.xpath('//table[@class="data"]'):
+            if tables.xpath('@summary').extract()[0] == 'One dimensional data':
+                log.msg('NIST table: Aggregrate data', level=log.DEBUG)
+            elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
+                log.msg('NIST table; Enthalpy/entropy of phase transition',
+                        level=log.DEBUG)
+            elif tables.xpath('tr[1]/td'):
+                log.msg('NIST table: Horizontal table', level=log.DEBUG)
+            elif (tables.xpath('@summary').extract()[0] ==
+                    'Antoine Equation Parameters'):
+                log.msg('NIST table: Antoine Equation Parameters',
+                        level=log.DEBUG)
+            elif len(tables.xpath('tr[1]/th')) == 5:
+                log.msg('NIST table: generic 5 columns', level=log.DEBUG)
+                # Symbol (unit) Temperature (K) Method Reference Comment
+            elif len(tables.xpath('tr[1]/th')) == 4:
+                log.msg('NIST table: generic 4 columns', level=log.DEBUG)
+                # Symbol (unit) Temperature (K) Reference Comment
+            else:
+                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
+                continue #Assume unsupported
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 85595ecf350e173692420f91eb439571444fed86 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 18:12:08 +0200
Subject: [PATCH 07/39] created function to start scraping the aggregate data
 table

---
 FourmiCrawler/sources/NIST.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 37f8d04..6e884ef 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -16,6 +16,8 @@ class NIST(Source):
     def parse(self, response):
         sel = Selector(response)
 
+        requests = []
+
         symbol_table = {}
         tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
         for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
@@ -28,6 +30,8 @@ class NIST(Source):
         for tables in sel.xpath('//table[@class="data"]'):
             if tables.xpath('@summary').extract()[0] == 'One dimensional data':
                 log.msg('NIST table: Aggregrate data', level=log.DEBUG)
+                requests.extend(
+                    self.parse_aggregate_data(tables, symbol_table))
             elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
@@ -46,6 +50,25 @@ class NIST(Source):
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported
+        return requests
+
+    @staticmethod
+    def parse_aggregate_data(table, symbol_table):
+        results = []
+        for tr in table.xpath('tr[td]'):
+            data = []
+            for td in tr.xpath('td'):
+                data.append(''.join(td.xpath('node()').extract()))
+            result = Result({
+                'attribute': symbol_table[data[0]],
+                'value': data[1] + ' ' + data[2],
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
+            log.msg('NIST: |%s|' % data, level=log.DEBUG)
+            results.append(result)
+        return results
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,

From 10dd74e02617401f80e32c29dd95513fe93248a7 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 21:58:52 +0200
Subject: [PATCH 08/39] added function to scrape transition tables

---
 FourmiCrawler/sources/NIST.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 6e884ef..0191fee 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -3,7 +3,7 @@ from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
 from FourmiCrawler.items import Result
-
+import re
 
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
@@ -35,6 +35,8 @@ class NIST(Source):
             elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
+                requests.extend(
+                    self.parse_transition_data(tables, symbol_table))
             elif tables.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
             elif (tables.xpath('@summary').extract()[0] ==
@@ -70,6 +72,28 @@ class NIST(Source):
             results.append(result)
         return results
 
+    @staticmethod
+    def parse_transition_data(table, symbol_table):
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
+            })
+            log.msg('NIST: |%s|' % result, level=log.DEBUG)
+            results.append(result)
+
+
+        return results
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 7abb491d3fdcd92661b9c7dc8a7ebfbd686e4cbb Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 7 May 2014 22:08:43 +0200
Subject: [PATCH 09/39] added function for most generic tables

---
 FourmiCrawler/sources/NIST.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 0191fee..5757546 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -46,9 +46,13 @@ class NIST(Source):
             elif len(tables.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
+                requests.extend(
+                    self.parse_generic_data(tables))
             elif len(tables.xpath('tr[1]/th')) == 4:
                 log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Reference Comment
+                requests.extend(
+                    self.parse_generic_data(tables))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported
@@ -94,6 +98,26 @@ class NIST(Source):
 
         return results
 
+    @staticmethod
+    def parse_generic_data(table):
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': tds[0] + ' ' + unit,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[1]
+            })
+            log.msg('NIST: |%s|' % result, level=log.DEBUG)
+            results.append(result)
+        return results
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 151f1988a16d4fb58134320106b7712105083681 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 8 May 2014 14:54:30 +0200
Subject: [PATCH 10/39] added function to scrape table for Antoine equation
 parameters

---
 FourmiCrawler/sources/NIST.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 5757546..aba0ec2 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -43,6 +43,8 @@ class NIST(Source):
                     'Antoine Equation Parameters'):
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
+                requests.extend(
+                    self.parse_antoine_data(tables))
             elif len(tables.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
@@ -118,6 +120,25 @@ class NIST(Source):
             results.append(result)
         return results
 
+    @staticmethod
+    def parse_antoine_data(table):
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': '%s K' % tds[0]
+            })
+            results.append(result)
+
+        return results
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From f6fa5e8adf283810ccb12389642015a37df44431 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 8 May 2014 15:22:48 +0200
Subject: [PATCH 11/39] fixed scraping of unit for two kinds of tables

---
 FourmiCrawler/sources/NIST.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index aba0ec2..0ce3a28 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -83,7 +83,11 @@ class NIST(Source):
         results = []
 
         name = table.xpath('@summary').extract()[0]
-        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -105,7 +109,11 @@ class NIST(Source):
         results = []
 
         name = table.xpath('@summary').extract()[0]
-        unit = table.xpath('tr[1]/th[1]/node()').extract()[-1][2:-1]
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()

From 74dddace883e84b8bad787751ccf961f7125eca4 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 8 May 2014 15:42:53 +0200
Subject: [PATCH 12/39] removed logging of Result objects in debug messages
 because pointless

---
 FourmiCrawler/sources/NIST.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 0ce3a28..635a61b 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -98,7 +98,6 @@ class NIST(Source):
                 'reliability': 'Unknown',
                 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
             })
-            log.msg('NIST: |%s|' % result, level=log.DEBUG)
             results.append(result)
 
 
@@ -124,7 +123,6 @@ class NIST(Source):
                 'reliability': 'Unknown',
                 'conditions': '%s K' % tds[1]
             })
-            log.msg('NIST: |%s|' % result, level=log.DEBUG)
             results.append(result)
         return results
 

From 5e067fd57297769b63dc102352f858024233fec3 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Fri, 9 May 2014 12:36:54 +0200
Subject: [PATCH 13/39] altered scraping of aggregate data to test for and
 request url to individual data points

---
 FourmiCrawler/sources/NIST.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 635a61b..1222c63 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -60,10 +60,16 @@ class NIST(Source):
                 continue #Assume unsupported
         return requests
 
-    @staticmethod
-    def parse_aggregate_data(table, symbol_table):
+    def parse_aggregate_data(self, table, symbol_table):
         results = []
         for tr in table.xpath('tr[td]'):
+            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
+                                '/a/@href').extract()
+            if extra_data_url:
+                request = Request(url=self.website[:-1] + extra_data_url[0],
+                    callback=self.parse_individual_datapoints)
+                results.append(request)
+                continue
             data = []
             for td in tr.xpath('td'):
                 data.append(''.join(td.xpath('node()').extract()))
@@ -145,6 +151,9 @@ class NIST(Source):
 
         return results
 
+    def parse_individual_datapoints(self, response):
+        pass
+
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,
                        callback=self.parse)

From 775a920b9bc72f2d6f7e08624a0203af4f0b0a22 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Fri, 9 May 2014 13:00:22 +0200
Subject: [PATCH 14/39] NIST scraper now handles urls with individual data
 points

---
 FourmiCrawler/sources/NIST.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 1222c63..6ae6862 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -152,7 +152,30 @@ class NIST(Source):
         return results
 
     def parse_individual_datapoints(self, response):
-        pass
+        sel = Selector(response)
+        table = sel.xpath('//table[@class="data"]')[0]
+
+        results = []
+
+        name = table.xpath('@summary').extract()[0]
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        for tr in table.xpath('tr[td]'):
+            tds = tr.xpath('td/text()').extract()
+            result = Result({
+                'attribute': name,
+                'value': '%s %s' % (tds[0], unit),
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
+            results.append(result)
+
+        return results
 
     def new_compound_request(self, compound):
         return Request(url=self.website[:-1] + self.search % compound,

From 7e984b60d8a8d42d832a160554e356ab833a5419 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Fri, 9 May 2014 14:24:08 +0200
Subject: [PATCH 15/39] added uncertainty to results from scraping individual
 data points urls

---
 FourmiCrawler/sources/NIST.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 6ae6862..2d3c672 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -166,9 +166,14 @@ class NIST(Source):
 
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
+            uncertainty = ''
+            m = re.search('Uncertainty assigned by TRC =  (.*?) ', tds[-1])
+            if m:
+                uncertainty = '+- %s ' % m.group(1)
+                # [TODO]: get the plusminus sign working in here
             result = Result({
                 'attribute': name,
-                'value': '%s %s' % (tds[0], unit),
+                'value': '%s %s%s' % (tds[0], uncertainty, unit),
                 'source': 'NIST',
                 'reliability': 'Unknown',
                 'conditions': ''

From 50c79e3b1f7357b9100b3aefa097a036696ace2f Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 14 May 2014 13:44:43 +0200
Subject: [PATCH 16/39] conditions in name (split by ' at ') are now moved to
 condition field for individual value page and aggregate data table

---
 FourmiCrawler/sources/NIST.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 2d3c672..5a9b544 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -73,12 +73,21 @@ class NIST(Source):
             data = []
             for td in tr.xpath('td'):
                 data.append(''.join(td.xpath('node()').extract()))
+
+            name = symbol_table[data[0]]
+            condition = ''
+
+            m = re.match(r'(.*) at (.*)', name)
+            if m:
+                name = m.group(1)
+                condition = m.group(2)
+
             result = Result({
-                'attribute': symbol_table[data[0]],
+                'attribute': name,
                 'value': data[1] + ' ' + data[2],
                 'source': 'NIST',
                 'reliability': 'Unknown',
-                'conditions': ''
+                'conditions': condition
             })
             log.msg('NIST: |%s|' % data, level=log.DEBUG)
             results.append(result)
@@ -158,6 +167,12 @@ class NIST(Source):
         results = []
 
         name = table.xpath('@summary').extract()[0]
+        condition = ''
+        m = re.match(r'(.*) at (.*)', name)
+        if m:
+            name = m.group(1)
+            condition = m.group(2)
+
         tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
         m = re.search(r'\((.*)\)', tr_unit)
         unit = '!'
@@ -176,7 +191,7 @@ class NIST(Source):
                 'value': '%s %s%s' % (tds[0], uncertainty, unit),
                 'source': 'NIST',
                 'reliability': 'Unknown',
-                'conditions': ''
+                'conditions': condition
             })
             results.append(result)
 

From 98f58ea4e26f6ef8dfbd44467b3d47ebee64d283 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Thu, 15 May 2014 14:29:28 +0200
Subject: [PATCH 17/39] added scraping for generic info except for synonyms

---
 FourmiCrawler/sources/NIST.py | 37 +++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 5a9b544..ddb7a09 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -18,6 +18,8 @@ class NIST(Source):
 
         requests = []
 
+        requests.extend(self.parse_generic_info(sel))
+
         symbol_table = {}
         tds = sel.xpath('//table[@class="symbol_table"]/tr/td')
         for (symbol_td, name_td) in zip(tds[::2], tds[1::2]):
@@ -60,6 +62,41 @@ class NIST(Source):
                 continue #Assume unsupported
         return requests
 
+    def parse_generic_info(self, sel):
+        ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
+        li = ul.xpath('li')
+
+        data = {}
+
+        raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()
+        data['Chemical formula'] = ''.join(raw_formula[2:]).strip()
+
+        raw_mol_weight = ul.xpath('li[strong/a="Molecular weight"]/text()')
+        data['Molecular weight'] = raw_mol_weight.extract()[0].strip()
+
+        raw_inchi = ul.xpath('li[strong="IUPAC Standard InChI:"]//tt/text()')
+        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
+
+        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
+                            '/tt/text()')
+        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
+
+        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
+        data['CAS Registry Number'] = raw_cas_number.extract()[0].strip()
+
+        requests = []
+        for key, value in data.iteritems():
+            result = Result({
+                'attribute': key,
+                'value': value,
+                'source': 'NIST',
+                'reliability': 'Unknown',
+                'conditions': ''
+            })
+            requests.append(result)
+
+        return requests
+
     def parse_aggregate_data(self, table, symbol_table):
         results = []
         for tr in table.xpath('tr[td]'):

From 56ee6b1ad347c475ef24f077377c35577b4fbfc5 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 14:09:10 +0200
Subject: [PATCH 18/39] added ignore list

---
 FourmiCrawler/sources/NIST.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index ddb7a09..17552e5 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -5,11 +5,15 @@ from scrapy.selector import Selector
 from FourmiCrawler.items import Result
 import re
 
+# [TODO]: values can be '128.', perhaps remove the dot in that case?
+
 class NIST(Source):
     website = "http://webbook.nist.gov/*"  
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
+    ignore_list = set()
+
     def __init__(self):
         Source.__init__(self)
 
@@ -235,5 +239,7 @@ class NIST(Source):
         return results
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + self.search % compound,
-                       callback=self.parse)
+        if compound not in self.ignore_list:
+            self.ignore_list.update(compound)
+            return Request(url=self.website[:-1] + self.search % compound,
+                           callback=self.parse)

From afc1106838120503bd4cde2af80781e3e2738c9d Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 14:11:03 +0200
Subject: [PATCH 19/39] NIST now logs an error if chemical name is not found

---
 FourmiCrawler/sources/NIST.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 17552e5..d5eaa76 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -20,6 +20,11 @@ class NIST(Source):
     def parse(self, response):
         sel = Selector(response)
 
+        title = sel.xpath('head/title/text()').extract()[0]
+        if title == 'Name Not Found':
+            log.msg('NIST: Chemical not found!', level=log.ERROR)
+            return
+
         requests = []
 
         requests.extend(self.parse_generic_info(sel))

From b46c7a309d8132efbed111ba13d4d52fa122d70f Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 14:21:11 +0200
Subject: [PATCH 20/39] if synonym name matched in search instead of primary
 name, emit primary name as synonym

---
 FourmiCrawler/sources/NIST.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index d5eaa76..a969384 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -24,6 +24,10 @@ class NIST(Source):
         if title == 'Name Not Found':
             log.msg('NIST: Chemical not found!', level=log.ERROR)
             return
+        if title not in self.ignore_list:
+            self.ignore_list.update(title)
+            log.msg('NIST emit synonym: %s' % title, level=log.DEBUG)
+            self._spider.get_synonym_requests(title)
 
         requests = []
 

From 472aae86be443c8372c356b0509e13cabd3b1c9d Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Sat, 17 May 2014 19:32:20 +0200
Subject: [PATCH 21/39] synonyms are now scraped

---
 FourmiCrawler/sources/NIST.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index a969384..4bb8e30 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -79,6 +79,12 @@ class NIST(Source):
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
         li = ul.xpath('li')
 
+        raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract()
+        for synonym in raw_synonyms[0].strip().split(';\n'):
+            log.msg('NIST synonym: %s' % synonym, level=log.DEBUG)
+            self.ignore_list.update(synonym)
+            self._spider.get_synonym_requests(synonym)
+
         data = {}
 
         raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()

From 81719a38fbb72a4344cd25c49c7361e8907779e6 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Tue, 20 May 2014 19:32:06 +0200
Subject: [PATCH 22/39] Added comments for the class and functions

---
 FourmiCrawler/sources/NIST.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 4bb8e30..a2fb425 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -6,8 +6,15 @@ from FourmiCrawler.items import Result
 import re
 
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
+# [TODO]: properties have references and comments which do not exist in the
+#         Result item, but should be included eventually.
 
 class NIST(Source):
+    """NIST Scraper plugin
+
+    This plugin manages searching for a chemical on the NIST website
+    and parsing the resulting page if the chemical exists on NIST.
+    """
     website = "http://webbook.nist.gov/*"  
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@@ -76,6 +83,9 @@ class NIST(Source):
         return requests
 
     def parse_generic_info(self, sel):
+        """Parses: synonyms, chemical formula, molecular weight, InChI,
+        InChiKey, CAS number
+        """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
         li = ul.xpath('li')
 
@@ -117,6 +127,9 @@ class NIST(Source):
         return requests
 
     def parse_aggregate_data(self, table, symbol_table):
+        """Parses the table(s) which contain possible links to individual
+        data points
+        """
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
@@ -151,6 +164,7 @@ class NIST(Source):
 
     @staticmethod
     def parse_transition_data(table, symbol_table):
+        """Parses the table containing properties regarding phase changes"""
         results = []
 
         name = table.xpath('@summary').extract()[0]
@@ -176,6 +190,11 @@ class NIST(Source):
 
     @staticmethod
     def parse_generic_data(table):
+        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+        form:
+        Symbol (unit)|Temperature (K)|Method|Reference|Comment
+        Symbol (unit)|Temperature (K)|Reference|Comment
+        """
         results = []
 
         name = table.xpath('@summary').extract()[0]
@@ -199,6 +218,7 @@ class NIST(Source):
 
     @staticmethod
     def parse_antoine_data(table):
+        """Parse table containing parameters for the Antione equation"""
         results = []
 
         name = table.xpath('@summary').extract()[0]
@@ -217,6 +237,7 @@ class NIST(Source):
         return results
 
     def parse_individual_datapoints(self, response):
+        """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
 

From 95565042cae6ba0eec9a58ff4308fa6e18a28765 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:22:03 +0200
Subject: [PATCH 23/39] removed unused variable symbol_table from
 parse_transition_data

---
 FourmiCrawler/sources/NIST.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index a2fb425..7770efc 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -57,8 +57,7 @@ class NIST(Source):
             elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
-                requests.extend(
-                    self.parse_transition_data(tables, symbol_table))
+                requests.extend(self.parse_transition_data(tables))
             elif tables.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
             elif (tables.xpath('@summary').extract()[0] ==
@@ -163,7 +162,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_transition_data(table, symbol_table):
+    def parse_transition_data(table):
         """Parses the table containing properties regarding phase changes"""
         results = []
 

From 429ffd74221ff2262dde1ab1113bbbfaffe23001 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:28:54 +0200
Subject: [PATCH 24/39] renamed tables to table in parse()

---
 FourmiCrawler/sources/NIST.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 7770efc..d04b0d0 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -49,33 +49,30 @@ class NIST(Source):
             log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                     level=log.DEBUG)
 
-        for tables in sel.xpath('//table[@class="data"]'):
-            if tables.xpath('@summary').extract()[0] == 'One dimensional data':
+        for table in sel.xpath('//table[@class="data"]'):
+            if table.xpath('@summary').extract()[0] == 'One dimensional data':
                 log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                 requests.extend(
-                    self.parse_aggregate_data(tables, symbol_table))
-            elif tables.xpath('tr/th="Initial Phase"').extract()[0] == '1':
+                    self.parse_aggregate_data(table, symbol_table))
+            elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
-                requests.extend(self.parse_transition_data(tables))
-            elif tables.xpath('tr[1]/td'):
+                requests.extend(self.parse_transition_data(table))
+            elif table.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
-            elif (tables.xpath('@summary').extract()[0] ==
+            elif (table.xpath('@summary').extract()[0] ==
                     'Antoine Equation Parameters'):
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
-                requests.extend(
-                    self.parse_antoine_data(tables))
-            elif len(tables.xpath('tr[1]/th')) == 5:
+                requests.extend(self.parse_antoine_data(table))
+            elif len(table.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
-                requests.extend(
-                    self.parse_generic_data(tables))
-            elif len(tables.xpath('tr[1]/th')) == 4:
+                requests.extend(self.parse_generic_data(table))
+            elif len(table.xpath('tr[1]/th')) == 4:
                 log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Reference Comment
-                requests.extend(
-                    self.parse_generic_data(tables))
+                requests.extend(self.parse_generic_data(table))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported

From c0af24644b922eaa3223b7889ae094cd83290e1c Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:31:19 +0200
Subject: [PATCH 25/39] added summary variable in parse()

---
 FourmiCrawler/sources/NIST.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index d04b0d0..cadf6dc 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -50,7 +50,8 @@ class NIST(Source):
                     level=log.DEBUG)
 
         for table in sel.xpath('//table[@class="data"]'):
-            if table.xpath('@summary').extract()[0] == 'One dimensional data':
+            summary = table.xpath('@summary').extract()[0]
+            if summary == 'One dimensional data':
                 log.msg('NIST table: Aggregrate data', level=log.DEBUG)
                 requests.extend(
                     self.parse_aggregate_data(table, symbol_table))
@@ -60,8 +61,7 @@ class NIST(Source):
                 requests.extend(self.parse_transition_data(table))
             elif table.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
-            elif (table.xpath('@summary').extract()[0] ==
-                    'Antoine Equation Parameters'):
+            elif summary == 'Antoine Equation Parameters'):
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
                 requests.extend(self.parse_antoine_data(table))

From 6cd8edaf222f2bdfdc97ab59de914cc65d05369e Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:36:42 +0200
Subject: [PATCH 26/39] included summary variable in call to transition_table,
 antoine table and generic table

---
 FourmiCrawler/sources/NIST.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index cadf6dc..76caca3 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -58,21 +58,21 @@ class NIST(Source):
             elif table.xpath('tr/th="Initial Phase"').extract()[0] == '1':
                 log.msg('NIST table; Enthalpy/entropy of phase transition',
                         level=log.DEBUG)
-                requests.extend(self.parse_transition_data(table))
+                requests.extend(self.parse_transition_data(table, summary))
             elif table.xpath('tr[1]/td'):
                 log.msg('NIST table: Horizontal table', level=log.DEBUG)
-            elif summary == 'Antoine Equation Parameters'):
+            elif summary == 'Antoine Equation Parameters':
                 log.msg('NIST table: Antoine Equation Parameters',
                         level=log.DEBUG)
-                requests.extend(self.parse_antoine_data(table))
+                requests.extend(self.parse_antoine_data(table, summary))
             elif len(table.xpath('tr[1]/th')) == 5:
                 log.msg('NIST table: generic 5 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Method Reference Comment
-                requests.extend(self.parse_generic_data(table))
+                requests.extend(self.parse_generic_data(table, summary))
             elif len(table.xpath('tr[1]/th')) == 4:
                 log.msg('NIST table: generic 4 columns', level=log.DEBUG)
                 # Symbol (unit) Temperature (K) Reference Comment
-                requests.extend(self.parse_generic_data(table))
+                requests.extend(self.parse_generic_data(table, summary))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported
@@ -159,7 +159,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_transition_data(table):
+    def parse_transition_data(table, summary):
         """Parses the table containing properties regarding phase changes"""
         results = []
 
@@ -185,7 +185,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_generic_data(table):
+    def parse_generic_data(table, summary):
         """Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
@@ -213,7 +213,7 @@ class NIST(Source):
         return results
 
     @staticmethod
-    def parse_antoine_data(table):
+    def parse_antoine_data(table, summary):
         """Parse table containing parameters for the Antione equation"""
         results = []
 

From 6ce5ff23359786ae24274fcac3e11a7934a453e6 Mon Sep 17 00:00:00 2001
From: Rob tB <r.tenberge@student.ru.nl>
Date: Wed, 21 May 2014 10:40:44 +0200
Subject: [PATCH 27/39] replaced name variable with summary variable

---
 FourmiCrawler/sources/NIST.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 76caca3..0b75b17 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -163,7 +163,6 @@ class NIST(Source):
         """Parses the table containing properties regarding phase changes"""
         results = []
 
-        name = table.xpath('@summary').extract()[0]
         tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
         m = re.search(r'\((.*)\)', tr_unit)
         unit = '!'
@@ -173,7 +172,7 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
             result = Result({
-                'attribute': name,
+                'attribute': summary,
                 'value': tds[0] + ' ' + unit,
                 'source': 'NIST',
                 'reliability': 'Unknown',
@@ -193,7 +192,6 @@ class NIST(Source):
         """
         results = []
 
-        name = table.xpath('@summary').extract()[0]
         tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
         m = re.search(r'\((.*)\)', tr_unit)
         unit = '!'
@@ -203,7 +201,7 @@ class NIST(Source):
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
             result = Result({
-                'attribute': name,
+                'attribute': summary,
                 'value': tds[0] + ' ' + unit,
                 'source': 'NIST',
                 'reliability': 'Unknown',
@@ -217,12 +215,10 @@ class NIST(Source):
         """Parse table containing parameters for the Antione equation"""
         results = []
 
-        name = table.xpath('@summary').extract()[0]
-
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
             result = Result({
-                'attribute': name,
+                'attribute': summary,
                 'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
                 'source': 'NIST',
                 'reliability': 'Unknown',

From 98f91a1aa9b8b0fdce5a6c9de903695a46d0dcc9 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 22 May 2014 12:15:43 +0200
Subject: [PATCH 28/39] Added a pipeline to replace None values with empty
 strings

---
 FourmiCrawler/pipelines.py | 16 ++++++++++++++++
 FourmiCrawler/settings.py  |  5 +++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index e1dadbf..2c775f2 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -5,6 +5,22 @@
 import re
 from scrapy.exceptions import DropItem
 
+class RemoveNonePipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
+    def process_item(self, item, spider):
+        """
+        Processing the items so None values are replaced by empty strings
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: The item, with every None value replaced by an empty string
+        """
+        for key in item:
+            if item[key] is None:
+                item[key] = ""
+        return item
 
 class DuplicatePipeline(object):
 
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index d7ac212..be7c451 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -11,8 +11,9 @@ BOT_NAME = 'FourmiCrawler'
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
-    'FourmiCrawler.pipelines.DuplicatePipeline': 200,
+    "FourmiCrawler.pipelines.RemoveNonePipeline": 100,
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
+    'FourmiCrawler.pipelines.DuplicatePipeline': 300,
 }
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'

From f80a32a0dcb0e5216d3301969e6f360fc1d8cc31 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 22 May 2014 12:19:41 +0200
Subject: [PATCH 29/39] Pushed the version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index efa4e54..08e010b 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
 
 
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1')
     loader = SourceLoader()
 
     if arguments["--include"]:

From 7a4696aed4def4bb00f5a06a76f0a3a1249f86f8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Fri, 23 May 2014 13:05:01 +0200
Subject: [PATCH 30/39] Tested and pushed version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 08e010b..6d5b8e8 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
 
 
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
     loader = SourceLoader()
 
     if arguments["--include"]:

From ca90796904b8d3ff0b0cd0c00f87b1761ddca7ad Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 19:53:37 +0200
Subject: [PATCH 31/39] Added documentation to the Executable Python file

---
 fourmi.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index efa4e54..c09087d 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -34,6 +34,13 @@ from sourceloader import SourceLoader
 
 
 def setup_crawler(searchable, settings, source_loader, attributes):
+    """
+    This function prepares and start the crawler which starts the actual search on the internet
+    :param searchable: The compound which should be searched
+    :param settings: A scrapy settings object
+    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
+    :param attributes: A list of regular expressions which the attribute names should match.
+    """
     spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
     spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
@@ -44,8 +51,13 @@ def setup_crawler(searchable, settings, source_loader, attributes):
 
 
 def scrapy_settings_manipulation(docopt_arguments):
+    """
+    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
+    project these are command line arguments.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
     settings = get_project_settings()
-    # [todo] - add at least a warning for files that already exist
+
     if docopt_arguments["--output"] != 'result.*format*':
         settings.overrides["FEED_URI"] = docopt_arguments["--output"]
     elif docopt_arguments["--format"] == "jsonlines":
@@ -60,6 +72,10 @@ def scrapy_settings_manipulation(docopt_arguments):
 
 
 def start_log(docopt_arguments):
+    """
+    This function starts the logging functionality of Scrapy using the settings given by the CLI.
+    :param docopt_arguments:  A dictionary generated by docopt containing all CLI arguments.
+    """
     if docopt_arguments["--log"] is not None:
         if docopt_arguments["--verbose"]:
             log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
@@ -73,12 +89,18 @@ def start_log(docopt_arguments):
 
 
 def search(docopt_arguments, source_loader):
+    """
+    The function that facilitates the search for a specific compound.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
+    """
     start_log(docopt_arguments)
     settings = scrapy_settings_manipulation(docopt_arguments)
     setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
 
 
+# The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
     arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
     loader = SourceLoader()

From e272c9f3425d42446abd1f428448edc944f22319 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 19:55:10 +0200
Subject: [PATCH 32/39] Changed a parameter name for clarification

---
 fourmi.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fourmi.py b/fourmi.py
index c09087d..9f32cff 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -33,15 +33,15 @@ from FourmiCrawler.spider import FourmiSpider
 from sourceloader import SourceLoader
 
 
-def setup_crawler(searchable, settings, source_loader, attributes):
+def setup_crawler(compound, settings, source_loader, attributes):
     """
     This function prepares and start the crawler which starts the actual search on the internet
-    :param searchable: The compound which should be searched
+    :param compound: The compound which should be searched
     :param settings: A scrapy settings object
     :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
     :param attributes: A list of regular expressions which the attribute names should match.
     """
-    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
+    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
     spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

From a040bc7a0263aed473ab1b5ce2f294aeaad81d2b Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 20:01:19 +0200
Subject: [PATCH 33/39] Added documentation for the sourceloader

---
 sourceloader.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sourceloader.py b/sourceloader.py
index 9957a70..2ed50a8 100644
--- a/sourceloader.py
+++ b/sourceloader.py
@@ -1,6 +1,7 @@
 import inspect
 import os
 import re
+
 from FourmiCrawler.sources.source import Source
 
 
@@ -8,6 +9,10 @@ class SourceLoader:
     sources = []
 
     def __init__(self, rel_dir="FourmiCrawler/sources"):
+        """
+        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        :param rel_dir: A relative path to a directory.
+        """
         path = os.path.dirname(os.path.abspath(__file__))
         path += "/" + rel_dir
         known_parser = set()
@@ -21,18 +26,30 @@ class SourceLoader:
                     known_parser.add(cls)
 
     def include(self, source_names):
+        """
+        This function excludes all sources that don't match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         new = set()
         for name in source_names:
             new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = list(new)
 
     def exclude(self, source_names):
+        """
+        This function excludes all sources that match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         exclude = []
         for name in source_names:
             exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = [src for src in self.sources if src not in exclude]
 
     def __str__(self):
+        """
+        This function returns a string with all sources currently available in the SourceLoader.
+        :return: a string with all available sources.
+        """
         string = ""
         for src in self.sources:
             string += "Source: " + src.__class__.__name__

From c4876f029baa41dd17197f0fb72fc5c466f71d1d Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 20:14:47 +0200
Subject: [PATCH 34/39] Added documentation to the FourmiSpider

---
 FourmiCrawler/spider.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 87f22c6..8ec18cc 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -1,19 +1,34 @@
+import re
+
 from scrapy.spider import Spider
 from scrapy import log
-import re
 
 
 class FourmiSpider(Spider):
+    """
+    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
+    """
     name = "FourmiSpider"
     __parsers = []
     synonyms = []
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.append(compound)
         self.selected_attributes = selected_attributes;
 
     def parse(self, reponse):
+        """
+        The function that is called when a response to a request is available. This function distributes this to a
+        parser which should be able to handle parsing the data.
+        :param reponse: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Request to be handled by the scrapy core.
+        """
         for parser in self.__parsers:
             if re.match(parser.website, reponse.url):
                 log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
@@ -21,6 +36,11 @@ class FourmiSpider(Spider):
         return None
 
     def get_synonym_requests(self, compound):
+        """
+        A function that generates new Scrapy Request for each source given a new synonym of a compound.
+        :param compound: A compound name
+        :return: A list of Scrapy Request objects
+        """
         requests = []
         for parser in self.__parsers:
             parser_requests = parser.new_compound_request(compound)
@@ -29,15 +49,27 @@ class FourmiSpider(Spider):
         return requests
 
     def start_requests(self):
+        """
+        The function called by Scrapy for its first Requests
+        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
+        """
         requests = []
         for synonym in self.synonyms:
             requests.extend(self.get_synonym_requests(synonym))
         return requests
 
     def add_parsers(self, parsers):
+        """
+        A function to add a new Parser objects to the list of available parsers.
+        :param parsers: A list of Parser Objects.
+        """
         for parser in parsers:
             self.add_parser(parser)
 
     def add_parser(self, parser):
+        """
+        A function add a new Parser object to the list of available parsers.
+        :param parser: A Parser Object
+        """
         self.__parsers.append(parser)
         parser.set_spider(self)
\ No newline at end of file

From 3499946e97be70b98de89566a30999ba0d1666b8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 20:15:15 +0200
Subject: [PATCH 35/39] Fixed a typo

---
 FourmiCrawler/spider.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 8ec18cc..a58b6ea 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -22,17 +22,17 @@ class FourmiSpider(Spider):
         self.synonyms.append(compound)
         self.selected_attributes = selected_attributes;
 
-    def parse(self, reponse):
+    def parse(self, response):
         """
         The function that is called when a response to a request is available. This function distributes this to a
         parser which should be able to handle parsing the data.
-        :param reponse: A Scrapy Response object that should be parsed
+        :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
         for parser in self.__parsers:
-            if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(reponse)
+            if re.match(parser.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + parser.website, level=log.DEBUG)
+                return parser.parse(response)
         return None
 
     def get_synonym_requests(self, compound):

From c27a875d681d0f912570bef4a583b85ea483bdbe Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 20:18:03 +0200
Subject: [PATCH 36/39] Parser/Source consistency

---
 FourmiCrawler/spider.py | 32 ++++++++++++++++----------------
 fourmi.py               |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index a58b6ea..08abb6b 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,7 +9,7 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    __parsers = []
+    __sources = []
     synonyms = []
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
@@ -25,14 +25,14 @@ class FourmiSpider(Spider):
     def parse(self, response):
         """
         The function that is called when a response to a request is available. This function distributes this to a
-        parser which should be able to handle parsing the data.
+        source which should be able to handle parsing the data.
         :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
-        for parser in self.__parsers:
-            if re.match(parser.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(response)
+        for source in self.__sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
         return None
 
     def get_synonym_requests(self, compound):
@@ -42,7 +42,7 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self.__parsers:
+        for parser in self.__sources:
             parser_requests = parser.new_compound_request(compound)
             if parser_requests is not None:
                 requests.append(parser_requests)
@@ -58,18 +58,18 @@ class FourmiSpider(Spider):
             requests.extend(self.get_synonym_requests(synonym))
         return requests
 
-    def add_parsers(self, parsers):
+    def add_sources(self, sources):
         """
-        A function to add a new Parser objects to the list of available parsers.
-        :param parsers: A list of Parser Objects.
+        A function to add new Source objects to the list of available sources.
+        :param sources: A list of Source Objects.
         """
-        for parser in parsers:
-            self.add_parser(parser)
+        for parser in sources:
+            self.add_source(parser)
 
-    def add_parser(self, parser):
+    def add_source(self, source):
         """
         A function add a new Parser object to the list of available parsers.
-        :param parser: A Parser Object
+        :param source: A Source Object
         """
-        self.__parsers.append(parser)
-        parser.set_spider(self)
\ No newline at end of file
+        self.__sources.append(source)
+        source.set_spider(self)
\ No newline at end of file
diff --git a/fourmi.py b/fourmi.py
index 9f32cff..945c8a2 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -42,7 +42,7 @@ def setup_crawler(compound, settings, source_loader, attributes):
     :param attributes: A list of regular expressions which the attribute names should match.
     """
     spider = FourmiSpider(compound=compound, selected_attributes=attributes)
-    spider.add_parsers(source_loader.sources)
+    spider.add_sources(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()

From f7d0fb4a450c10ab6ce147406f216e537f474c32 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 20:24:54 +0200
Subject: [PATCH 37/39] Added documentation to the basic Source

---
 FourmiCrawler/sources/source.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index 3c51724..1ac0b9e 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -7,15 +7,32 @@ class Source:
     _spider = None
 
     def __init__(self):
+        """
+        Initiation of a new Source
+        """
         pass
 
     def parse(self, reponse):
+        """
+        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
+        :param reponse: A Scrapy Response object
+        :return: A list of Result items and new Scrapy Requests
+        """
         log.msg("The parse function of the empty parser was used.", level=log.WARNING)
         pass
 
     def new_compound_request(self, compound):
+        """
+        This function should return a Scrapy Request for the given compound.
+        :param compound: A compound name.
+        :return: A new Scrapy Request
+        """
         # return Request(url=self.website[:-1] + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):
+        """
+        A function to save the associated spider.
+        :param spider: A FourmiSpider object
+        """
         self._spider = spider

From f81b1c950074a8ab181b3f91034f58db9c2b8c54 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 20:25:46 +0200
Subject: [PATCH 38/39] Fixed a typo

---
 FourmiCrawler/sources/source.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index 1ac0b9e..d289d72 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -12,13 +12,13 @@ class Source:
         """
         pass
 
-    def parse(self, reponse):
+    def parse(self, response):
         """
         This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
-        :param reponse: A Scrapy Response object
+        :param response: A Scrapy Response object
         :return: A list of Result items and new Scrapy Requests
         """
-        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        log.msg("The parse function of the empty source was used.", level=log.WARNING)
         pass
 
     def new_compound_request(self, compound):

From aac0a7c79c661db1c452bc5d31c9b2c77589701c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Sun, 1 Jun 2014 20:29:51 +0200
Subject: [PATCH 39/39] References to the main Scrapy documentation

---
 FourmiCrawler/items.py     | 4 +---
 FourmiCrawler/pipelines.py | 7 +++----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/items.py b/FourmiCrawler/items.py
index c7fd41c..9f9a516 100644
--- a/FourmiCrawler/items.py
+++ b/FourmiCrawler/items.py
@@ -1,6 +1,4 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 from scrapy.item import Item, Field
diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index e1dadbf..ff7ceed 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -1,8 +1,7 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 import re
+
 from scrapy.exceptions import DropItem