diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index 338f224..ace60ab 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -23,5 +23,5 @@ FEED_FORMAT = 'jsonlines' # [todo] - Check for repercussions on spoofing the user agent -# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' -USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' +USER_AGENT = 'Fourmi' +# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36' diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 23b25fe..6ca5382 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -19,7 +19,7 @@ class ChemSpider(Source): somewhere. """ - website = 'http://www\.chemspider\.com/.*' + website = 'http://www\\.chemspider\\.com/.*' search = 'Search.asmx/SimpleSearch?query=%s&token=' structure = 'Chemical-Structure.%s.html' @@ -293,6 +293,6 @@ class ChemSpider(Source): """ if compound in self.ignore_list or self.cfg['token'] == '': return None - searchurl = self.website[:-2] + self.search % compound + searchurl = self.website[:-2].replace("\\", "") + self.search % compound log.msg('chemspider compound', level=log.DEBUG) return Request(url=searchurl, callback=self.parse_searchrequest) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 904df80..4ad93f5 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -18,7 +18,7 @@ class NIST(Source): This plugin manages searching for a chemical on the NIST website and parsing the resulting page if the chemical exists on NIST. """ - website = "http://webbook\.nist\.gov/.*" + website = "http://webbook\\.nist\\.gov/.*" search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' @@ -329,5 +329,5 @@ class NIST(Source): """ if compound not in self.ignore_list: self.ignore_list.update(compound) - return Request(url=self.website[:-2] + self.search % compound, + return Request(url=self.website[:-2].replace("\\", "") + self.search % compound, callback=self.parse) diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py index 521b02d..5947e54 100644 --- a/FourmiCrawler/sources/PubChem.py +++ b/FourmiCrawler/sources/PubChem.py @@ -16,8 +16,8 @@ class PubChem(Source): """ #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used - website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*' - website_www = 'https://www.ncbi.nlm.nih.gov/.*' + website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' + website_www = 'https://www.ncbi.nlm.nih.gov/*' website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*' search = 'pccompound?term=%s' data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 385311c..e27bb39 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -15,7 +15,7 @@ class WikipediaParser(Source): It also returns requests with other external sources which contain information on parsed subject. """ - website = "http://en\.wikipedia\.org/wiki/.*" + website = "http://en\\.wikipedia\\.org/wiki/.*" __spider = None searched_compounds = [] @@ -123,7 +123,7 @@ class WikipediaParser(Source): return items def new_compound_request(self, compound): - return Request(url=self.website[:-2] + compound, callback=self.parse) + return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) @staticmethod def clean_items(items): diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index 3ffb47d..a0d3dcd 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -30,7 +30,7 @@ class Source: :param compound: A compound name. :return: A new Scrapy Request """ - # return Request(url=self.website[:-2] + compound, callback=self.parse) + # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse) pass def set_spider(self, spider):