From f728dff6b09614f98b51b756c3bbd4b7f3cda12f Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 14 May 2014 12:01:05 +0200
Subject: [PATCH 01/16] Developing PubChem parser, first draft, not tested nor
 finished completely

---
 FourmiCrawler/sources/PubChem.py | 84 ++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 FourmiCrawler/sources/PubChem.py

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
new file mode 100644
index 0000000..00b2cd7
--- /dev/null
+++ b/FourmiCrawler/sources/PubChem.py
@@ -0,0 +1,84 @@
+from scrapy.http import Request
+from scrapy import log
+from source import Source
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+import re
+
+
+class PubChem(Source):
+    """ PubChem scraper for chemical properties
+
+        This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
+    """
+
+    # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia
+
+    #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297"            #contains name of compound but not all parsable data
+    website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
+
+    __spider = None
+    searched_compounds = []
+
+    def __init__(self):
+        Source.__init__(self)
+
+    def parse(self, response):
+        """ Distributes the above described behaviour """
+        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+        sel = Selector(response)
+        compound = sel.xpath('//h1/text()').extract()[0]
+        if compound in self.searched_compounds:
+            return None
+        else:
+            items = self.parse_properties(sel)
+            self.searched_compounds.append(compound)
+            return items
+
+    def parse_properties(self, sel):
+        """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
+        items = []
+
+
+        prop_names = sel.xpath('.//div[@id="d27"//div/b').\
+            xpath('normalize-space(string())')
+        prop_values = sel.xpath('.//div[@id="d27"//div/a').\
+            xpath('normalize-space(string())')
+        prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\
+            xpath('normalize-space(string())')
+
+        for i, prop_name in enumerate(prop_names):
+            item = Result({
+                'attribute': prop_name.extract().encode('utf-8'),
+                'value': prop_values[i].extract().encode('utf-8'),
+                'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'),
+                'reliability': "",
+                'conditions': ""
+            })
+            items.append(item)
+
+            print item
+
+            log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
+        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
+        # item_list = self.clean_items(items)
+
+
+        return items
+
+    def new_compound_request(self, compound):
+        return Request(url=self.website[:-1] + compound, callback=self.parse)
+
+    # @staticmethod
+    # def clean_items(items):
+    #     """ clean up properties using regex, makes it possible to split the values from the units """
+    #     for item in items:
+    #         value = item['value']
+    #         m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
+    #         if m:
+    #             item['value'] = m.group(1) + " K"
+    #         m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
+    #         if m:
+    #             item['value'] = m.group(1) + " J/K/mol"
+    #     return items

From 84f2e3dbea9a2f137bf7c441bb347313cccdf11d Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 14:53:51 +0200
Subject: [PATCH 02/16] Testing search function PubChem

---
 FourmiCrawler/sources/PubChem.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 00b2cd7..d34a2cb 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -12,10 +12,16 @@ class PubChem(Source):
         This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
     """
 
-    # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia
+    # TO DO: make url variable with help of PubChem identifier ID / cid
 
     #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297"            #contains name of compound but not all parsable data
-    website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
+    # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
+
+
+    website = 'https://www.ncbi.nlm.nih.gov/*'
+
+
+    search = 'pccompound?term=%s'
 
     __spider = None
     searched_compounds = []
@@ -31,8 +37,10 @@ class PubChem(Source):
         if compound in self.searched_compounds:
             return None
         else:
-            items = self.parse_properties(sel)
+            # items = self.parse_properties(sel)
+            items = []
             self.searched_compounds.append(compound)
+            print items
             return items
 
     def parse_properties(self, sel):
@@ -68,7 +76,7 @@ class PubChem(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-1] + self.search % compound, callback=self.parse)
 
     # @staticmethod
     # def clean_items(items):

From 4b377bb9a966e4b1fd82101e865d70fae0c30b1c Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 15:25:55 +0200
Subject: [PATCH 03/16] PubChem now scrapes its synonyms

---
 FourmiCrawler/sources/PubChem.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index d34a2cb..0ce727f 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -19,12 +19,10 @@ class PubChem(Source):
 
 
     website = 'https://www.ncbi.nlm.nih.gov/*'
-
-
     search = 'pccompound?term=%s'
 
     __spider = None
-    searched_compounds = []
+    searched_compounds = set()
 
     def __init__(self):
         Source.__init__(self)
@@ -34,12 +32,21 @@ class PubChem(Source):
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         compound = sel.xpath('//h1/text()').extract()[0]
+        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
+        for synonym in raw_synonyms.strip().split(', '):
+            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
+            self.searched_compounds.update(synonym)
+            self._spider.get_synonym_requests(synonym)
+
+
+        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
+
         if compound in self.searched_compounds:
             return None
         else:
             # items = self.parse_properties(sel)
             items = []
-            self.searched_compounds.append(compound)
+            self.searched_compounds.update(compound)
             print items
             return items
 

From fb41d772f203b420784582732ea64fd45d96c51d Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 16:11:02 +0200
Subject: [PATCH 04/16] Added custom user-agent because the server would
 otherwise block the scraper

---
 FourmiCrawler/settings.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index be91fef..490a3a5 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -16,6 +16,8 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
 
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+
 
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent

From 8083d0c7bc03459de2aab224a811653389aa0ebf Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 16:11:48 +0200
Subject: [PATCH 05/16] PubChem scrapes synonyms, builds custom url to fetch
 property data from

---
 FourmiCrawler/sources/PubChem.py | 40 ++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 0ce727f..e2dcc8b 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -18,8 +18,11 @@ class PubChem(Source):
     # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
 
 
-    website = 'https://www.ncbi.nlm.nih.gov/*'
+    website = 'https://*.ncbi.nlm.nih.gov/*'
+    website_www = 'https://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
     search = 'pccompound?term=%s'
+    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
 
     __spider = None
     searched_compounds = set()
@@ -29,26 +32,39 @@ class PubChem(Source):
 
     def parse(self, response):
         """ Distributes the above described behaviour """
+        requests = []
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+
         sel = Selector(response)
         compound = sel.xpath('//h1/text()').extract()[0]
+        if compound in self.searched_compounds:
+            return None
+
+        self.searched_compounds.update(compound)
         raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
         for synonym in raw_synonyms.strip().split(', '):
             log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
             self.searched_compounds.update(synonym)
             self._spider.get_synonym_requests(synonym)
-
-
         log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
 
-        if compound in self.searched_compounds:
-            return None
-        else:
-            # items = self.parse_properties(sel)
-            items = []
-            self.searched_compounds.update(compound)
-            print items
-            return items
+        n = re.search(r'cid=(\d+)',response.url)
+        if n:
+            cid = n.group(1)
+        log.msg('cid: %s' % cid, level=log.DEBUG)
+        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+
+        return requests
+
+    def parse_data(self, response):
+        log.msg('parsing data', level=log.DEBUG)
+        requests = []
+
+
+
+
+        return requests
+
 
     def parse_properties(self, sel):
         """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
@@ -83,7 +99,7 @@ class PubChem(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
 
     # @staticmethod
     # def clean_items(items):

From ba8f8451786088c12b4645f61261ab4e8d96598b Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Mon, 2 Jun 2014 09:26:36 +0200
Subject: [PATCH 06/16] now also (finally) scrapes property values and names,
 but not yet coupled together and not yet returned.

---
 FourmiCrawler/sources/PubChem.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index e2dcc8b..6718900 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -60,12 +60,20 @@ class PubChem(Source):
         log.msg('parsing data', level=log.DEBUG)
         requests = []
 
+        sel = Selector(response)
+        # props = sel.xpath('.//div')
+        prop_values = sel.xpath('//div//a/text()').extract()
+        prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract()
 
+        print prop_values
+        print prop_names
 
+        # print props
 
         return requests
 
 
+    # this (old) definition is only here to help myself
     def parse_properties(self, sel):
         """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
         items = []
@@ -95,9 +103,9 @@ class PubChem(Source):
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         # item_list = self.clean_items(items)
 
-
         return items
 
+
     def new_compound_request(self, compound):
         return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
 

From 291547a5addfb5f79dd8bcc0cb80c798f20f05db Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 4 Jun 2014 15:44:53 +0200
Subject: [PATCH 07/16] now returns good results, with property values and
 corresponding sources

---
 FourmiCrawler/sources/PubChem.py | 34 +++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 6718900..1d20231 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -61,14 +61,34 @@ class PubChem(Source):
         requests = []
 
         sel = Selector(response)
-        # props = sel.xpath('.//div')
-        prop_values = sel.xpath('//div//a/text()').extract()
-        prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract()
+        props = sel.xpath('//div')
 
-        print prop_values
-        print prop_names
-
-        # print props
+        for prop in props:
+            prop_name = ''.join(prop.xpath('b/text()').extract())
+            if prop.xpath('a'):
+                prop_source = ''.join(prop.xpath('a/@title').extract())
+                prop_value = ''.join(prop.xpath('a/text()').extract())
+                new_prop = Result({
+                    'attribute': prop_name,
+                    'value': prop_value,
+                    'source': prop_source,
+                    'reliability': 'Unknown',
+                    'conditions': ''
+                })
+                requests.append(new_prop)
+            elif prop.xpath('ul'):
+                prop_values = prop.xpath('ul//li')
+                for prop_li in prop_values:
+                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
+                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
+                    new_prop = Result({
+                        'attribute': prop_name,
+                        'value': prop_value,
+                        'source': prop_source,
+                        'reliability': 'Unknown',
+                        'conditions': ''
+                    })
+                    requests.append(new_prop)
 
         return requests
 

From f1047405667c789b1a1c4238ae84eeac10834cfe Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 11 Jun 2014 16:39:00 +0200
Subject: [PATCH 08/16] cleaned up useless code

---
 FourmiCrawler/sources/PubChem.py | 54 +-------------------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 1d20231..6490b20 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -12,12 +12,6 @@ class PubChem(Source):
         This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
     """
 
-    # TO DO: make url variable with help of PubChem identifier ID / cid
-
-    #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297"            #contains name of compound but not all parsable data
-    # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
-
-
     website = 'https://*.ncbi.nlm.nih.gov/*'
     website_www = 'https://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
@@ -93,51 +87,5 @@ class PubChem(Source):
         return requests
 
 
-    # this (old) definition is only here to help myself
-    def parse_properties(self, sel):
-        """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
-        items = []
-
-
-        prop_names = sel.xpath('.//div[@id="d27"//div/b').\
-            xpath('normalize-space(string())')
-        prop_values = sel.xpath('.//div[@id="d27"//div/a').\
-            xpath('normalize-space(string())')
-        prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\
-            xpath('normalize-space(string())')
-
-        for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'),
-                'reliability': "",
-                'conditions': ""
-            })
-            items.append(item)
-
-            print item
-
-            log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
-
-        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
-        # item_list = self.clean_items(items)
-
-        return items
-
-
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
-
-    # @staticmethod
-    # def clean_items(items):
-    #     """ clean up properties using regex, makes it possible to split the values from the units """
-    #     for item in items:
-    #         value = item['value']
-    #         m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
-    #         if m:
-    #             item['value'] = m.group(1) + " K"
-    #         m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
-    #         if m:
-    #             item['value'] = m.group(1) + " J/K/mol"
-    #     return items
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
\ No newline at end of file

From a903e78f9ebe4f855c9ffc0d74ce4faa95831c4f Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 11 Jun 2014 16:40:32 +0200
Subject: [PATCH 09/16] added PubChem to sources.cfg

---
 sources.cfg | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 sources.cfg

diff --git a/sources.cfg b/sources.cfg
new file mode 100644
index 0000000..a9fa2fb
--- /dev/null
+++ b/sources.cfg
@@ -0,0 +1,15 @@
+[DEFAULT]
+reliability = Unknown
+
+[ChemSpider]
+reliability = High
+token = 052bfd06-5ce4-43d6-bf12-89eabefd2338
+
+[NIST]
+reliability = High
+
+[WikipediaParser]
+reliability = Medium
+
+[PubChem]
+reliability = High
\ No newline at end of file

From 8836cdf16b758b86bc1e20402b85b2c3d4b11990 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 11 Jun 2014 18:39:01 +0200
Subject: [PATCH 10/16] fixed config errors due to merge with develop

---
 FourmiCrawler/sources/PubChem.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 6490b20..ab6a99e 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -21,8 +21,9 @@ class PubChem(Source):
     __spider = None
     searched_compounds = set()
 
-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config):
+        Source.__init__(self, config)
+        self.cfg = config
 
     def parse(self, response):
         """ Distributes the above described behaviour """
@@ -88,4 +89,4 @@ class PubChem(Source):
 
 
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
\ No newline at end of file
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)

From 4dc557d9e8e7bb5ac529e0201f577e23aeca29cb Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Tue, 17 Jun 2014 00:09:17 +0200
Subject: [PATCH 11/16] Finish plugin (comments, log messages, etc)

---
 FourmiCrawler/sources/PubChem.py | 33 +++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index ab6a99e..fc8250b 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -9,9 +9,11 @@ import re
 class PubChem(Source):
     """ PubChem scraper for chemical properties
 
-        This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
+        This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
+        including sources of the values of properties.
     """
 
+    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
     website = 'https://*.ncbi.nlm.nih.gov/*'
     website_www = 'https://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
@@ -26,7 +28,11 @@ class PubChem(Source):
         self.cfg = config
 
     def parse(self, response):
-        """ Distributes the above described behaviour """
+        """
+        Distributes the above described behaviour
+        :param response: The incoming search request
+        :return Returns the found properties if response is unique or returns none if it's already known
+        """
         requests = []
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
 
@@ -46,12 +52,19 @@ class PubChem(Source):
         n = re.search(r'cid=(\d+)',response.url)
         if n:
             cid = n.group(1)
-        log.msg('cid: %s' % cid, level=log.DEBUG)
-        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+        log.msg('cid: %s' % cid, level=log.DEBUG)   #getting the right id of the compound with which it can reach
+                                                # the seperate html page which contains the properties and their values
 
+        #using this cid to get the right url and scrape it
+        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
         return requests
 
     def parse_data(self, response):
+        """
+        Parse data found in 'Chemical and Physical properties' part of a substance page.
+        :param response: The response with the page to parse
+        :return: requests: Returns a list of properties with their values, source, etc.
+        """
         log.msg('parsing data', level=log.DEBUG)
         requests = []
 
@@ -59,8 +72,8 @@ class PubChem(Source):
         props = sel.xpath('//div')
 
         for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract())
-            if prop.xpath('a'):
+            prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
+            if prop.xpath('a'):     # parsing for single value in property
                 prop_source = ''.join(prop.xpath('a/@title').extract())
                 prop_value = ''.join(prop.xpath('a/text()').extract())
                 new_prop = Result({
@@ -70,8 +83,11 @@ class PubChem(Source):
                     'reliability': 'Unknown',
                     'conditions': ''
                 })
+                log.msg('PubChem prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)
                 requests.append(new_prop)
-            elif prop.xpath('ul'):
+            elif prop.xpath('ul'):    # parsing for multiple values (list) in property
                 prop_values = prop.xpath('ul//li')
                 for prop_li in prop_values:
                     prop_value = ''.join(prop_li.xpath('a/text()').extract())
@@ -83,6 +99,9 @@ class PubChem(Source):
                         'reliability': 'Unknown',
                         'conditions': ''
                     })
+                    log.msg('PubChem prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)
                     requests.append(new_prop)
 
         return requests

From 56e1d3cfb6a785b3a2b444a93eeca2fb02b2be88 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 17 Jun 2014 00:28:01 +0200
Subject: [PATCH 12/16] No config files should be included on GitHub

---
 sources.cfg | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 sources.cfg

diff --git a/sources.cfg b/sources.cfg
deleted file mode 100644
index a9fa2fb..0000000
--- a/sources.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-[DEFAULT]
-reliability = Unknown
-
-[ChemSpider]
-reliability = High
-token = 052bfd06-5ce4-43d6-bf12-89eabefd2338
-
-[NIST]
-reliability = High
-
-[WikipediaParser]
-reliability = Medium
-
-[PubChem]
-reliability = High
\ No newline at end of file

From 6e16e9f23e19016ac5a5d3eff3dd4e07cdf9e8c8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 17 Jun 2014 00:33:08 +0200
Subject: [PATCH 13/16] TODO on spoofing user agent

---
 FourmiCrawler/settings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 320f573..338f224 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -18,10 +18,10 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
 
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
-
-
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
 
+# [todo] - Check for repercussions on spoofing the user agent
+
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'

From 25bf003bdbda36095bc5d972820bfb5666c8765c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 17 Jun 2014 00:35:50 +0200
Subject: [PATCH 14/16] Added PubChem to changelog

---
 Changelog.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Changelog.md b/Changelog.md
index 99d61fb..b1885f6 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -1,6 +1,7 @@
 ### v0.5.3
 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
 - FIX: Logging is now "actually" disabled if not using the verbose option.
+- FEATURE: Added support for PubChem
 
 ### v0.5.2 
 - FIX: Signatured used to contain untracked and older files, current signature
@@ -8,4 +9,4 @@ should be correct.
 
 ### v0.5.1
 - UPDATED: Logging functionality from command line
-- DEV: Code cleanup and extra tests
\ No newline at end of file
+- DEV: Code cleanup and extra tests

From bb62c335d2872d16d40e04830646adc6df59d20a Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 17 Jun 2014 00:36:31 +0200
Subject: [PATCH 15/16] Bumped version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 9408818..86f2808 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -69,7 +69,7 @@ def search(docopt_arguments, source_loader):
 
 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.2')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.3')
     loader = SourceLoader()
 
     if arguments["--include"]:

From 35fe51d9161ba1d9bc2147125c54e0fb701008ea Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 17 Jun 2014 00:37:34 +0200
Subject: [PATCH 16/16] Signed the new version

---
 SIGNED.md | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/SIGNED.md b/SIGNED.md
index 35d0887..3fc4507 100644
--- a/SIGNED.md
+++ b/SIGNED.md
@@ -3,19 +3,19 @@
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.11 (GNU/Linux)
 
-iQIcBAABAgAGBQJTnfIhAAoJEJrQ9RIUCT6/SbIQANKLzmkxwH11vM84kkRbmgHE
-d3jLYYNEDQArCTOObYxvyrvE0BK2fhzbdBfccO9rLqu19FnBhcN3WLbkb/WM+2af
-G8GkC7yFsWPs1lkrBbouvObPmqwVChGhRETd7xNU6D1NRGKLDT9lXv1FkjU2qt6P
-CQwF129aTRzCZ9XGoVKG9wnKuaPm2EYkYHKlG3eck+eeKklTlmJcGi5ON7iGsUpE
-hNVrSg8WwN4SzpOEgXlyBn9Zzci81XeZqy3Fnp7u1CEq5tOuWITXa1i5wQ9Jq/2n
-5HP0XLbY5grW6Cpqh5jDUiX/XnNtCwpPWRnz4lCLswwMIDLCpq5tJubIay7GMvsx
-fV1+UUGAR1EcWNWI0R6XJNbb2EHzidDJcLWlVo1InJDxevECq3CNnh7fRC9bixiG
-EV0C/Abig/rvyX5cc9ozmwO3e0gzmtwwyywxOWLzJgVns3jfuA9MhaGDczIC1kuR
-Tig9ciByErhT6v8SjgS3gyhWc+tRSx5R3M1Y78CungW3c61VA3Jo/fWHY6Db0JwH
-9lVnGU4Ql4mbQQQAv7e/6r6ZhYwoBsAkOKdqT4Dn8aLaItZ8+oB2FXEl/P6V55hN
-ambDSt476mwJcyDyIIwxTLyqcop2zYBdaUATe8lwo+0OoXuCLfjnThkHzy2dA0CP
-xqHuzkM3Pdb6qOU3cUK7
-=PVt+
+iQIcBAABAgAGBQJTn3GgAAoJEJrQ9RIUCT6/CI4P/RSAQrd6JugGZoQu/gNdW6eB
+MYCybqYGZiieVhUaGOnFNVlp68YpXH+sP/Uc6hXEX30UQEsDmhMeT5NA7ZMS+zJ9
+MNHGQdJq22lGb3+VoVBV4RTMdkQXOXvx6p5biskjIEtM3tfTxP529GvAX2TFUNnt
+gGWk28EDr30M95XwDxwWo+57Xv8VtSb3VSvXEbrdwGYf8EoQo9oPtzYQ0YcdupcC
+ET8bukYVcwpAjoTnPlEy89TiHHohwmimr2ASXeQ64Ks5wfjzcF7NENCAmaAfR+KI
+VLLuGqdWMBx1ewVuAXTCZ0Mga/kBoRUaO0PC13UmL8LhhZY9Z3cwD4UnPU35/RQi
+IbLfQcZHf/gEvyMeiTYCsyWpm+/xxn1+EfHol4/Q9VSXzZgRBX05Ik6tqeCvjdgG
+4PyHBaJTTm/HfMNdg3mr1mbyjTv5UxglEyPv+Y4NdfoVfepkXsXbzvNSyVffZ3Bw
+UaFp7KzIC4Jugdpv63FleiAdDY0+iZ5shH86wD1+HJ0/a87kn5Ao1yESby7J7U+f
+poZQYeMFeuC0T5hY/3iYoyvZ68oH918ESESiucSulp5BvfwuqGL2+xo5uJIwGYXE
+3IDQC7xbA14JHX86IVJlSHAD33iWyiC+5yjw4/bRRVl37KPsLdHiXH3YIRnF5I2I
+ZbM/uDYyJdZbBe4UoCoF
+=AMhi
 -----END PGP SIGNATURE-----
 
 ```
@@ -31,22 +31,23 @@ size  exec  file                      contents
             ./                                                                                        
 375           .gitignore              d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1
 464           .travis.yml             3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c
-208           Changelog.md            370ecb699890e839e73e22822286b2b2ee7e7ec6c485908e10b8c30e7f9acd47
+428           Changelog.md            c7791d1914ddca9ff1549d90468a79787a7feafe94cecd756e3d7cbd4bcbc7df
               FourmiCrawler/                                                                          
 0               __init__.py           e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
 304             items.py              b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806
 2178            pipelines.py          f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49
-716             settings.py           37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6
+914             settings.py           0be2eaf8e83e85ed27754c896421180fc80cb5ce44449aa9f1048e465d1a96f2
                 sources/                                                                              
 9991              ChemSpider.py       847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d
 9898              NIST.py             97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644
+4754              PubChem.py          58ed4c92519e385f2768cf8034b006b18f8a21632cb1c5a0849b1a329a8c6ffb
 6907              WikipediaParser.py  5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97
 0                 __init__.py         e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
 1262              source.py           16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4
 3026            spider.py             1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a
 1081          LICENSE                 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c
 3965          README.md               d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3
-3659  x       fourmi.py               7b4202ecfc8726fcc3f211c459aada7f5610fa4c4c0a7b916e44fc12d71010a1
+3676  x       fourmi.py               2ff89f97fd2a49d08417d9ab6cf08e88944d0c45f54ec84550b530be48676c23
 261           scrapy.cfg              624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85
               tests/                                                                                  
 1               __init__.py           01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b