From f728dff6b09614f98b51b756c3bbd4b7f3cda12f Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 14 May 2014 12:01:05 +0200
Subject: [PATCH 01/13] Developing PubChem parser, first draft, not tested nor
 finished completely

---
 FourmiCrawler/sources/PubChem.py | 84 ++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 FourmiCrawler/sources/PubChem.py

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
new file mode 100644
index 0000000..00b2cd7
--- /dev/null
+++ b/FourmiCrawler/sources/PubChem.py
@@ -0,0 +1,84 @@
+from scrapy.http import Request
+from scrapy import log
+from source import Source
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+import re
+
+
+class PubChem(Source):
+    """ PubChem scraper for chemical properties
+
+        This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
+    """
+
+    # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia
+
+    #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297"            #contains name of compound but not all parsable data
+    website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
+
+    __spider = None
+    searched_compounds = []
+
+    def __init__(self):
+        Source.__init__(self)
+
+    def parse(self, response):
+        """ Distributes the above described behaviour """
+        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+        sel = Selector(response)
+        compound = sel.xpath('//h1/text()').extract()[0]
+        if compound in self.searched_compounds:
+            return None
+        else:
+            items = self.parse_properties(sel)
+            self.searched_compounds.append(compound)
+            return items
+
+    def parse_properties(self, sel):
+        """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
+        items = []
+
+
+        prop_names = sel.xpath('.//div[@id="d27"//div/b').\
+            xpath('normalize-space(string())')
+        prop_values = sel.xpath('.//div[@id="d27"//div/a').\
+            xpath('normalize-space(string())')
+        prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\
+            xpath('normalize-space(string())')
+
+        for i, prop_name in enumerate(prop_names):
+            item = Result({
+                'attribute': prop_name.extract().encode('utf-8'),
+                'value': prop_values[i].extract().encode('utf-8'),
+                'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'),
+                'reliability': "",
+                'conditions': ""
+            })
+            items.append(item)
+
+            print item
+
+            log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
+        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
+        # item_list = self.clean_items(items)
+
+
+        return items
+
+    def new_compound_request(self, compound):
+        return Request(url=self.website[:-1] + compound, callback=self.parse)
+
+    # @staticmethod
+    # def clean_items(items):
+    #     """ clean up properties using regex, makes it possible to split the values from the units """
+    #     for item in items:
+    #         value = item['value']
+    #         m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
+    #         if m:
+    #             item['value'] = m.group(1) + " K"
+    #         m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
+    #         if m:
+    #             item['value'] = m.group(1) + " J/K/mol"
+    #     return items

From 84f2e3dbea9a2f137bf7c441bb347313cccdf11d Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 14:53:51 +0200
Subject: [PATCH 02/13] Testing search function PubChem

---
 FourmiCrawler/sources/PubChem.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 00b2cd7..d34a2cb 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -12,10 +12,16 @@ class PubChem(Source):
         This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
     """
 
-    # TO DO: make url variable with help of PubChem identifier ID given by Wikipedia
+    # TO DO: make url variable with help of PubChem identifier ID / cid
 
     #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297"            #contains name of compound but not all parsable data
-    website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
+    # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
+
+
+    website = 'https://www.ncbi.nlm.nih.gov/*'
+
+
+    search = 'pccompound?term=%s'
 
     __spider = None
     searched_compounds = []
@@ -31,8 +37,10 @@ class PubChem(Source):
         if compound in self.searched_compounds:
             return None
         else:
-            items = self.parse_properties(sel)
+            # items = self.parse_properties(sel)
+            items = []
             self.searched_compounds.append(compound)
+            print items
             return items
 
     def parse_properties(self, sel):
@@ -68,7 +76,7 @@ class PubChem(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-1] + self.search % compound, callback=self.parse)
 
     # @staticmethod
     # def clean_items(items):

From 4b377bb9a966e4b1fd82101e865d70fae0c30b1c Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 15:25:55 +0200
Subject: [PATCH 03/13] PubChem now scrapes its synonyms

---
 FourmiCrawler/sources/PubChem.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index d34a2cb..0ce727f 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -19,12 +19,10 @@ class PubChem(Source):
 
 
     website = 'https://www.ncbi.nlm.nih.gov/*'
-
-
     search = 'pccompound?term=%s'
 
     __spider = None
-    searched_compounds = []
+    searched_compounds = set()
 
     def __init__(self):
         Source.__init__(self)
@@ -34,12 +32,21 @@ class PubChem(Source):
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         compound = sel.xpath('//h1/text()').extract()[0]
+        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
+        for synonym in raw_synonyms.strip().split(', '):
+            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
+            self.searched_compounds.update(synonym)
+            self._spider.get_synonym_requests(synonym)
+
+
+        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
+
         if compound in self.searched_compounds:
             return None
         else:
             # items = self.parse_properties(sel)
             items = []
-            self.searched_compounds.append(compound)
+            self.searched_compounds.update(compound)
             print items
             return items
 

From fb41d772f203b420784582732ea64fd45d96c51d Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 16:11:02 +0200
Subject: [PATCH 04/13] Added custom user-agent because otherwise the site
 blocks requests from the scraper

---
 FourmiCrawler/settings.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index be91fef..490a3a5 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -16,6 +16,8 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
 
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+
 
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent

From 8083d0c7bc03459de2aab224a811653389aa0ebf Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 21 May 2014 16:11:48 +0200
Subject: [PATCH 05/13] PubChem scrapes synonyms, gets custom url to get data
 on properties from

---
 FourmiCrawler/sources/PubChem.py | 40 ++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 0ce727f..e2dcc8b 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -18,8 +18,11 @@ class PubChem(Source):
     # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
 
 
-    website = 'https://www.ncbi.nlm.nih.gov/*'
+    website = 'https://*.ncbi.nlm.nih.gov/*'
+    website_www = 'https://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
     search = 'pccompound?term=%s'
+    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
 
     __spider = None
     searched_compounds = set()
@@ -29,26 +32,39 @@ class PubChem(Source):
 
     def parse(self, response):
         """ Distributes the above described behaviour """
+        requests = []
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+
         sel = Selector(response)
         compound = sel.xpath('//h1/text()').extract()[0]
+        if compound in self.searched_compounds:
+            return None
+
+        self.searched_compounds.update(compound)
         raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
         for synonym in raw_synonyms.strip().split(', '):
             log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
             self.searched_compounds.update(synonym)
             self._spider.get_synonym_requests(synonym)
-
-
         log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
 
-        if compound in self.searched_compounds:
-            return None
-        else:
-            # items = self.parse_properties(sel)
-            items = []
-            self.searched_compounds.update(compound)
-            print items
-            return items
+        n = re.search(r'cid=(\d+)',response.url)
+        if n:
+            cid = n.group(1)
+        log.msg('cid: %s' % cid, level=log.DEBUG)
+        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+
+        return requests
+
+    def parse_data(self, response):
+        log.msg('parsing data', level=log.DEBUG)
+        requests = []
+
+
+
+
+        return requests
+
 
     def parse_properties(self, sel):
         """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
@@ -83,7 +99,7 @@ class PubChem(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
 
     # @staticmethod
     # def clean_items(items):

From ba8f8451786088c12b4645f61261ab4e8d96598b Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Mon, 2 Jun 2014 09:26:36 +0200
Subject: [PATCH 06/13] now also (finally) scrapes property values and names,
 but not yet coupled together and not yet returned.

---
 FourmiCrawler/sources/PubChem.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index e2dcc8b..6718900 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -60,12 +60,20 @@ class PubChem(Source):
         log.msg('parsing data', level=log.DEBUG)
         requests = []
 
+        sel = Selector(response)
+        # props = sel.xpath('.//div')
+        prop_values = sel.xpath('//div//a/text()').extract()
+        prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract()
 
+        print prop_values
+        print prop_names
 
+        # print props
 
         return requests
 
 
+    # this (old) definition is only here to help myself
     def parse_properties(self, sel):
         """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
         items = []
@@ -95,9 +103,9 @@ class PubChem(Source):
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         # item_list = self.clean_items(items)
 
-
         return items
 
+
     def new_compound_request(self, compound):
         return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
 

From 291547a5addfb5f79dd8bcc0cb80c798f20f05db Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 4 Jun 2014 15:44:53 +0200
Subject: [PATCH 07/13] now returns good results, with property values and
 corresponding sources

---
 FourmiCrawler/sources/PubChem.py | 34 +++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 6718900..1d20231 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -61,14 +61,34 @@ class PubChem(Source):
         requests = []
 
         sel = Selector(response)
-        # props = sel.xpath('.//div')
-        prop_values = sel.xpath('//div//a/text()').extract()
-        prop_names = sel.xpath('//div//a/ancestor::div/b/text()').extract()
+        props = sel.xpath('//div')
 
-        print prop_values
-        print prop_names
-
-        # print props
+        for prop in props:
+            prop_name = ''.join(prop.xpath('b/text()').extract())
+            if prop.xpath('a'):
+                prop_source = ''.join(prop.xpath('a/@title').extract())
+                prop_value = ''.join(prop.xpath('a/text()').extract())
+                new_prop = Result({
+                    'attribute': prop_name,
+                    'value': prop_value,
+                    'source': prop_source,
+                    'reliability': 'Unknown',
+                    'conditions': ''
+                })
+                requests.append(new_prop)
+            elif prop.xpath('ul'):
+                prop_values = prop.xpath('ul//li')
+                for prop_li in prop_values:
+                    prop_value = ''.join(prop_li.xpath('a/text()').extract())
+                    prop_source = ''.join(prop_li.xpath('a/@title').extract())
+                    new_prop = Result({
+                        'attribute': prop_name,
+                        'value': prop_value,
+                        'source': prop_source,
+                        'reliability': 'Unknown',
+                        'conditions': ''
+                    })
+                    requests.append(new_prop)
 
         return requests
 

From f1047405667c789b1a1c4238ae84eeac10834cfe Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 11 Jun 2014 16:39:00 +0200
Subject: [PATCH 08/13] cleaned up useless code

---
 FourmiCrawler/sources/PubChem.py | 54 +-------------------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 1d20231..6490b20 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -12,12 +12,6 @@ class PubChem(Source):
         This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
     """
 
-    # TO DO: make url variable with help of PubChem identifier ID / cid
-
-    #website = "https://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=297"            #contains name of compound but not all parsable data
-    # website = "https://pubchem.ncbi.nlm.nih.gov/toc/summary_toc.cgi?tocid=27&cid=297"  #contains properties to parse
-
-
     website = 'https://*.ncbi.nlm.nih.gov/*'
     website_www = 'https://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
@@ -93,51 +87,5 @@ class PubChem(Source):
         return requests
 
 
-    # this (old) definition is only here to help myself
-    def parse_properties(self, sel):
-        """ scrape data from 'Chemical and Physical Properties' box on PubChem. """
-        items = []
-
-
-        prop_names = sel.xpath('.//div[@id="d27"//div/b').\
-            xpath('normalize-space(string())')
-        prop_values = sel.xpath('.//div[@id="d27"//div/a').\
-            xpath('normalize-space(string())')
-        prop_sources = sel.xpath('.//div[@id="d27"//div/a[@title]').\
-            xpath('normalize-space(string())')
-
-        for i, prop_name in enumerate(prop_names):
-            item = Result({
-                'attribute': prop_name.extract().encode('utf-8'),
-                'value': prop_values[i].extract().encode('utf-8'),
-                'source': "PubChem: " + prop_sources[i].extract().encode('utf-8'),
-                'reliability': "",
-                'conditions': ""
-            })
-            items.append(item)
-
-            print item
-
-            log.msg('PubChem prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
-
-        items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
-        # item_list = self.clean_items(items)
-
-        return items
-
-
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
-
-    # @staticmethod
-    # def clean_items(items):
-    #     """ clean up properties using regex, makes it possible to split the values from the units """
-    #     for item in items:
-    #         value = item['value']
-    #         m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
-    #         if m:
-    #             item['value'] = m.group(1) + " K"
-    #         m = re.match('(\d+[\.,]?\d*)\sJ\sK.+mol', value)  # clean up J/K/mol values
-    #         if m:
-    #             item['value'] = m.group(1) + " J/K/mol"
-    #     return items
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
\ No newline at end of file

From a903e78f9ebe4f855c9ffc0d74ce4faa95831c4f Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Wed, 11 Jun 2014 16:40:32 +0200
Subject: [PATCH 09/13] added PubChem to sources.cfg

---
 sources.cfg | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 sources.cfg

diff --git a/sources.cfg b/sources.cfg
new file mode 100644
index 0000000..a9fa2fb
--- /dev/null
+++ b/sources.cfg
@@ -0,0 +1,15 @@
+[DEFAULT]
+reliability = Unknown
+
+[ChemSpider]
+reliability = High
+token = 052bfd06-5ce4-43d6-bf12-89eabefd2338
+
+[NIST]
+reliability = High
+
+[WikipediaParser]
+reliability = Medium
+
+[PubChem]
+reliability = High
\ No newline at end of file

From 8836cdf16b758b86bc1e20402b85b2c3d4b11990 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Wed, 11 Jun 2014 18:39:01 +0200
Subject: [PATCH 10/13] fixed config errors due to merge with develop

---
 FourmiCrawler/sources/PubChem.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 6490b20..ab6a99e 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -21,8 +21,9 @@ class PubChem(Source):
     __spider = None
     searched_compounds = set()
 
-    def __init__(self):
-        Source.__init__(self)
+    def __init__(self, config):
+        Source.__init__(self, config)
+        self.cfg = config
 
     def parse(self, response):
         """ Distributes the above described behaviour """
@@ -88,4 +89,4 @@ class PubChem(Source):
 
 
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
\ No newline at end of file
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)

From 4dc557d9e8e7bb5ac529e0201f577e23aeca29cb Mon Sep 17 00:00:00 2001
From: Nout van Deijck <nout@noutweb.com>
Date: Tue, 17 Jun 2014 00:09:17 +0200
Subject: [PATCH 11/13] Finish plugin (comments, log messages, etc)

---
 FourmiCrawler/sources/PubChem.py | 33 +++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index ab6a99e..fc8250b 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -9,9 +9,11 @@ import re
 class PubChem(Source):
     """ PubChem scraper for chemical properties
 
-        This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance.
+        This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
+        including sources of the values of properties.
     """
 
+    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
     website = 'https://*.ncbi.nlm.nih.gov/*'
     website_www = 'https://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
@@ -26,7 +28,11 @@ class PubChem(Source):
         self.cfg = config
 
     def parse(self, response):
-        """ Distributes the above described behaviour """
+        """
+        Distributes the above described behaviour
+        :param response: The incoming search request
+        :return Returns the found properties if response is unique or returns none if it's already known
+        """
         requests = []
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
 
@@ -46,12 +52,19 @@ class PubChem(Source):
         n = re.search(r'cid=(\d+)',response.url)
         if n:
             cid = n.group(1)
-        log.msg('cid: %s' % cid, level=log.DEBUG)
-        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+        log.msg('cid: %s' % cid, level=log.DEBUG)   #getting the right id of the compound with which it can reach
+                                                # the separate html page which contains the properties and their values
 
+        #using this cid to get the right url and scrape it
+        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
         return requests
 
     def parse_data(self, response):
+        """
+        Parse data found in 'Chemical and Physical properties' part of a substance page.
+        :param response: The response with the page to parse
+        :return: requests: Returns a list of properties with their values, source, etc.
+        """
         log.msg('parsing data', level=log.DEBUG)
         requests = []
 
@@ -59,8 +72,8 @@ class PubChem(Source):
         props = sel.xpath('//div')
 
         for prop in props:
-            prop_name = ''.join(prop.xpath('b/text()').extract())
-            if prop.xpath('a'):
+            prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing
+            if prop.xpath('a'):     # parsing for single value in property
                 prop_source = ''.join(prop.xpath('a/@title').extract())
                 prop_value = ''.join(prop.xpath('a/text()').extract())
                 new_prop = Result({
@@ -70,8 +83,11 @@ class PubChem(Source):
                     'reliability': 'Unknown',
                     'conditions': ''
                 })
+                log.msg('PubChem prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)
                 requests.append(new_prop)
-            elif prop.xpath('ul'):
+            elif prop.xpath('ul'):    # parsing for multiple values (list) in property
                 prop_values = prop.xpath('ul//li')
                 for prop_li in prop_values:
                     prop_value = ''.join(prop_li.xpath('a/text()').extract())
@@ -83,6 +99,9 @@ class PubChem(Source):
                         'reliability': 'Unknown',
                         'conditions': ''
                     })
+                    log.msg('PubChem prop: |%s| |%s| |%s|' %
+                        (new_prop['attribute'], new_prop['value'],
+                         new_prop['source']), level=log.DEBUG)
                     requests.append(new_prop)
 
         return requests

From 56e1d3cfb6a785b3a2b444a93eeca2fb02b2be88 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 17 Jun 2014 00:28:01 +0200
Subject: [PATCH 12/13] No config files should be included on GitHub

---
 sources.cfg | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 sources.cfg

diff --git a/sources.cfg b/sources.cfg
deleted file mode 100644
index a9fa2fb..0000000
--- a/sources.cfg
+++ /dev/null
@@ -1,15 +0,0 @@
-[DEFAULT]
-reliability = Unknown
-
-[ChemSpider]
-reliability = High
-token = 052bfd06-5ce4-43d6-bf12-89eabefd2338
-
-[NIST]
-reliability = High
-
-[WikipediaParser]
-reliability = Medium
-
-[PubChem]
-reliability = High
\ No newline at end of file

From 6e16e9f23e19016ac5a5d3eff3dd4e07cdf9e8c8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Tue, 17 Jun 2014 00:33:08 +0200
Subject: [PATCH 13/13] TODO on spoofing user agent

---
 FourmiCrawler/settings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 320f573..338f224 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -18,10 +18,10 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
 
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
-
-
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
 
+# [todo] - Check for repercussions on spoofing the user agent
+
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'