From 2cefcfdb133402f16f38ac9548e69e50e7cc3175 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Thu, 19 Jun 2014 12:46:09 +0200
Subject: [PATCH 1/9] made parse_searchrequest function to parse search page
 and modified new_compound_request accordingly

---
 FourmiCrawler/sources/PubChem.py | 37 +++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index fc8250b..08f8347 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -106,6 +106,41 @@ class PubChem(Source):
 
         return requests
 
+    def parse_searchrequest(self, response):
+        """
+        This function parses the response to the new_compound_request Request
+        :param response: the Response object to be parsed
+        :return: A Request for the compound page or what self.parse returns in
+                 case the search request forwarded to the compound page
+        """
+
+        #check if pubchem forwarded straight to compound page
+        m = re.match(self.website_pubchem, response.url)
+        if m:
+            log.msg('PubChem search forwarded to compound page',
+                    level=log.DEBUG)
+            return self.parse(response)
+
+        sel = Selector(response)
+
+        results = sel.xpath('//div[@class="rsltcont"]')
+        if results:
+            url = results[0].xpath('div/p/a[1]/@href')
+        else:
+            log.msg('PubChem search found nothing or xpath failed',
+                    level=log.DEBUG)
+            return None
+
+        if url:
+            url = 'http:' + ''.join(url[0].extract())
+            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
+        else:
+            log.msg('PubChem search found results, but no url in first result',
+                    level=log.DEBUG)
+            return None
+
+        return Request(url=url, callback=self.parse)
 
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound,
+                       callback=self.parse_searchrequest)

From 1fb8450367e2cb8640f0c7c4a3eb069be09330ec Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 19 Jun 2014 21:05:17 +0200
Subject: [PATCH 2/9] The cool folder separators!

---
 utils/configurator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/configurator.py b/utils/configurator.py
index b443529..358adc7 100644
--- a/utils/configurator.py
+++ b/utils/configurator.py
@@ -1,7 +1,7 @@
 import ConfigParser
+import os
 
 from scrapy.utils.project import get_project_settings
-import os
 
 class Configurator:
     """
@@ -67,7 +67,7 @@ class Configurator:
         :return a ConfigParser object of sources.cfg
         """
         current_dir = os.path.dirname(os.path.abspath(__file__))
-        config_path = current_dir + '\..\sources.cfg'
+        config_path = current_dir + '/../sources.cfg'
         # [TODO]: location of sources.cfg should be softcoded eventually
         config = ConfigParser.ConfigParser()
         config.read(config_path)

From 576683dcd0376440c04f483b820aeb7762dade27 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 19 Jun 2014 22:01:35 +0200
Subject: [PATCH 3/9] These regular expressions were all wrong

---
 FourmiCrawler/sources/ChemSpider.py      |  7 ++++---
 FourmiCrawler/sources/NIST.py            |  4 ++--
 FourmiCrawler/sources/PubChem.py         | 12 +++++++-----
 FourmiCrawler/sources/WikipediaParser.py |  4 ++--
 FourmiCrawler/sources/source.py          |  4 ++--
 5 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 5920b85..23b25fe 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -1,3 +1,5 @@
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
@@ -5,7 +7,6 @@ from scrapy.selector import Selector
 from source import Source
 from FourmiCrawler.items import Result
 
-import re
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
 
@@ -18,7 +19,7 @@ class ChemSpider(Source):
     somewhere.
     """
 
-    website = 'http://www.chemspider.com/*'
+    website = 'http://www\.chemspider\.com/.*'
 
     search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
@@ -292,6 +293,6 @@ class ChemSpider(Source):
         """
         if compound in self.ignore_list or self.cfg['token'] == '':
             return None
-        searchurl = self.website[:-1] + self.search % compound
+        searchurl = self.website[:-2] + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index c136b80..904df80 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -18,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook\.nist\.gov/.*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
@@ -329,5 +329,5 @@ class NIST(Source):
         """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-1] + self.search % compound,
+            return Request(url=self.website[:-2] + self.search % compound,
                            callback=self.parse)
diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 08f8347..521b02d 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class PubChem(Source):
@@ -14,9 +16,9 @@ class PubChem(Source):
     """
 
     #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://*.ncbi.nlm.nih.gov/*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*'
+    website_www = 'https://www.ncbi.nlm.nih.gov/.*'
+    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
     search = 'pccompound?term=%s'
     data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
 
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 401698c..385311c 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -15,7 +15,7 @@ class WikipediaParser(Source):
     It also returns requests with other external sources which contain information on parsed subject.
     """
 
-    website = "http://en.wikipedia.org/wiki/*"
+    website = "http://en\.wikipedia\.org/wiki/.*"
     __spider = None
     searched_compounds = []
 
@@ -123,7 +123,7 @@ class WikipediaParser(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-2] + compound, callback=self.parse)
 
     @staticmethod
     def clean_items(items):
diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index 36218b0..3ffb47d 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -3,7 +3,7 @@ from scrapy import log
 
 
 class Source:
-    website = "http://something/*"  # Regex of URI's the source is able to parse
+    website = "http://something/.*"  # Regex of URI's the source is able to parse
     _spider = None
 
     def __init__(self, config=None):
@@ -30,7 +30,7 @@ class Source:
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2] + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):

From ef1c3193966e9e64f53e5cb5af8ec17791f37aae Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 19 Jun 2014 22:05:21 +0200
Subject: [PATCH 4/9] Escape escape characters

---
 FourmiCrawler/settings.py                | 4 ++--
 FourmiCrawler/sources/ChemSpider.py      | 4 ++--
 FourmiCrawler/sources/NIST.py            | 4 ++--
 FourmiCrawler/sources/PubChem.py         | 4 ++--
 FourmiCrawler/sources/WikipediaParser.py | 4 ++--
 FourmiCrawler/sources/source.py          | 2 +-
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 338f224..ace60ab 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -23,5 +23,5 @@ FEED_FORMAT = 'jsonlines'
 
 # [todo] - Check for repercussions on spoofing the user agent
 
-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+USER_AGENT = 'Fourmi'
+# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 23b25fe..6ca5382 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -19,7 +19,7 @@ class ChemSpider(Source):
     somewhere.
     """
 
-    website = 'http://www\.chemspider\.com/.*'
+    website = 'http://www\\.chemspider\\.com/.*'
 
     search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
@@ -293,6 +293,6 @@ class ChemSpider(Source):
         """
         if compound in self.ignore_list or self.cfg['token'] == '':
             return None
-        searchurl = self.website[:-2] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 904df80..4ad93f5 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -18,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook\.nist\.gov/.*"
+    website = "http://webbook\\.nist\\.gov/.*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
@@ -329,5 +329,5 @@ class NIST(Source):
         """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-2] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                            callback=self.parse)
diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 521b02d..5947e54 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -16,8 +16,8 @@ class PubChem(Source):
     """
 
     #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/.*'
+    website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'https://www.ncbi.nlm.nih.gov/*'
     website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
     search = 'pccompound?term=%s'
     data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 385311c..e27bb39 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -15,7 +15,7 @@ class WikipediaParser(Source):
     It also returns requests with other external sources which contain information on parsed subject.
     """
 
-    website = "http://en\.wikipedia\.org/wiki/.*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
     __spider = None
     searched_compounds = []
 
@@ -123,7 +123,7 @@ class WikipediaParser(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-2] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
 
     @staticmethod
     def clean_items(items):
diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index 3ffb47d..a0d3dcd 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -30,7 +30,7 @@ class Source:
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-2] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):

From 27529c414f18c3332407288bd01c03c0cea68c24 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 19 Jun 2014 22:06:55 +0200
Subject: [PATCH 5/9] Fourmi as our USER_AGENT

---
 FourmiCrawler/settings.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index ace60ab..e82c8e6 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
 
-# [todo] - Check for repercussions on spoofing the user agent
-
 USER_AGENT = 'Fourmi'
-# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'

From a3e973ecadebb963a645318008f92e949d50dfbf Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 19 Jun 2014 22:08:45 +0200
Subject: [PATCH 6/9] Added INFO message when no compatible source on response

---
 FourmiCrawler/spider.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index ebfd2cf..32181ce 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -34,8 +34,9 @@ class FourmiSpider(Spider):
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None
 
     def get_synonym_requests(self, compound, force=False):

From 093eba8b0469a5223911f3f503db493e775c6992 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker" <jip@dekker.li>
Date: Thu, 19 Jun 2014 22:26:16 +0200
Subject: [PATCH 7/9] Other occurrences of website REGEX

---
 FourmiCrawler/sources/ChemSpider.py | 4 ++--
 FourmiCrawler/sources/NIST.py       | 2 +-
 FourmiCrawler/sources/PubChem.py    | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 6ca5382..b4bf6f0 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -277,8 +277,8 @@ class ChemSpider(Source):
             log.msg('ChemSpider found multiple substances, taking first '
                     'element', level=log.DEBUG)
         csid = csids[0]
-        structure_url = self.website[:-1] + self.structure % csid
-        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
+        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
         return [Request(url=structure_url,
                         callback=self.parse),
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 4ad93f5..691b062 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -164,7 +164,7 @@ class NIST(Source):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
-                request = Request(url=self.website[:-1] + extra_data_url[0],
+                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 5947e54..0768612 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -51,7 +51,7 @@ class PubChem(Source):
             self._spider.get_synonym_requests(synonym)
         log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
 
-        n = re.search(r'cid=(\d+)',response.url)
+        n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
         log.msg('cid: %s' % cid, level=log.DEBUG)   #getting the right id of the compound with which it can reach

From d7d2a659b12e351cb246dcb2a49bd4dd43eeb67a Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Thu, 19 Jun 2014 22:34:53 +0200
Subject: [PATCH 8/9] changed https to http in PubChem.py

---
 FourmiCrawler/sources/PubChem.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 0768612..0bc8b8c 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -16,9 +16,9 @@ class PubChem(Source):
     """
 
     #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
+    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'http://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
     search = 'pccompound?term=%s'
     data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
 

From 229091520999cf7215120e45f1b719d94dea34e2 Mon Sep 17 00:00:00 2001
From: RTB <robtberge@gmail.com>
Date: Thu, 19 Jun 2014 22:45:01 +0200
Subject: [PATCH 9/9] fixed forgotten self.website usage

---
 FourmiCrawler/sources/PubChem.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 0bc8b8c..15fa3f9 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -58,7 +58,7 @@ class PubChem(Source):
                                                 # the seperate html page which contains the properties and their values
 
         #using this cid to get the right url and scrape it
-        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+        requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
         return requests
 
     def parse_data(self, response):