Archived
1
0

Escape escape characters

This commit is contained in:
Jip J. Dekker 2014-06-19 22:05:21 +02:00
parent 576683dcd0
commit ef1c319396
6 changed files with 11 additions and 11 deletions

View File

@ -23,5 +23,5 @@ FEED_FORMAT = 'jsonlines'
# [todo] - Check for repercussions on spoofing the user agent
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
USER_AGENT = 'Fourmi'
# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'

View File

@ -19,7 +19,7 @@ class ChemSpider(Source):
somewhere.
"""
website = 'http://www\.chemspider\.com/.*'
website = 'http://www\\.chemspider\\.com/.*'
search = 'Search.asmx/SimpleSearch?query=%s&token='
structure = 'Chemical-Structure.%s.html'
@ -293,6 +293,6 @@ class ChemSpider(Source):
"""
if compound in self.ignore_list or self.cfg['token'] == '':
return None
searchurl = self.website[:-2] + self.search % compound
searchurl = self.website[:-2].replace("\\", "") + self.search % compound
log.msg('chemspider compound', level=log.DEBUG)
return Request(url=searchurl, callback=self.parse_searchrequest)

View File

@ -18,7 +18,7 @@ class NIST(Source):
This plugin manages searching for a chemical on the NIST website
and parsing the resulting page if the chemical exists on NIST.
"""
website = "http://webbook\.nist\.gov/.*"
website = "http://webbook\\.nist\\.gov/.*"
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@ -329,5 +329,5 @@ class NIST(Source):
"""
if compound not in self.ignore_list:
self.ignore_list.update(compound)
return Request(url=self.website[:-2] + self.search % compound,
return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
callback=self.parse)

View File

@ -16,8 +16,8 @@ class PubChem(Source):
"""
#PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*'
website_www = 'https://www.ncbi.nlm.nih.gov/.*'
website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
website_www = 'https://www.ncbi.nlm.nih.gov/*'
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
search = 'pccompound?term=%s'
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

View File

@ -15,7 +15,7 @@ class WikipediaParser(Source):
It also returns requests with other external sources which contain information on parsed subject.
"""
website = "http://en\.wikipedia\.org/wiki/.*"
website = "http://en\\.wikipedia\\.org/wiki/.*"
__spider = None
searched_compounds = []
@ -123,7 +123,7 @@ class WikipediaParser(Source):
return items
def new_compound_request(self, compound):
return Request(url=self.website[:-2] + compound, callback=self.parse)
return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
@staticmethod
def clean_items(items):

View File

@ -30,7 +30,7 @@ class Source:
:param compound: A compound name.
:return: A new Scrapy Request
"""
# return Request(url=self.website[:-2] + compound, callback=self.parse)
# return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
pass
def set_spider(self, spider):