Escape escape characters

2014-06-19 22:05:21 +02:00 · 2014-06-19 22:05:21 +02:00 · ef1c319396
commit ef1c319396
parent 576683dcd0
6 changed files with 11 additions and 11 deletions
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -23,5 +23,5 @@ FEED_FORMAT = 'jsonlines'

 # [todo] - Check for repercussions on spoofing the user agent

-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+USER_AGENT = 'Fourmi'
+# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -19,7 +19,7 @@ class ChemSpider(Source):
    somewhere.
    """

-    website = 'http://www\.chemspider\.com/.*'
+    website = 'http://www\\.chemspider\\.com/.*'

    search = 'Search.asmx/SimpleSearch?query=%s&token='
    structure = 'Chemical-Structure.%s.html'
@@ -293,6 +293,6 @@ class ChemSpider(Source):
        """
        if compound in self.ignore_list or self.cfg['token'] == '':
            return None
-        searchurl = self.website[:-2] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
        log.msg('chemspider compound', level=log.DEBUG)
        return Request(url=searchurl, callback=self.parse_searchrequest)
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -18,7 +18,7 @@ class NIST(Source):
    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
-    website = "http://webbook\.nist\.gov/.*"
+    website = "http://webbook\\.nist\\.gov/.*"

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

@@ -329,5 +329,5 @@ class NIST(Source):
        """
        if compound not in self.ignore_list:
            self.ignore_list.update(compound)
-            return Request(url=self.website[:-2] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                           callback=self.parse)
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -16,8 +16,8 @@ class PubChem(Source):
    """

    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/.*'
+    website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'https://www.ncbi.nlm.nih.gov/*'
    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
    search = 'pccompound?term=%s'
    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -15,7 +15,7 @@ class WikipediaParser(Source):
    It also returns requests with other external sources which contain information on parsed subject.
    """

-    website = "http://en\.wikipedia\.org/wiki/.*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
    __spider = None
    searched_compounds = []

@@ -123,7 +123,7 @@ class WikipediaParser(Source):
        return items

    def new_compound_request(self, compound):
-        return Request(url=self.website[:-2] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

    @staticmethod
    def clean_items(items):
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -30,7 +30,7 @@ class Source:
        :param compound: A compound name.
        :return: A new Scrapy Request
        """
-        # return Request(url=self.website[:-2] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
        pass

    def set_spider(self, spider):