Escape escape characters
This commit is contained in:
parent
576683dcd0
commit
ef1c319396
@ -23,5 +23,5 @@ FEED_FORMAT = 'jsonlines'
|
||||
|
||||
# [todo] - Check for repercussions on spoofing the user agent
|
||||
|
||||
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
|
||||
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
|
||||
USER_AGENT = 'Fourmi'
|
||||
# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
|
||||
|
@ -19,7 +19,7 @@ class ChemSpider(Source):
|
||||
somewhere.
|
||||
"""
|
||||
|
||||
website = 'http://www\.chemspider\.com/.*'
|
||||
website = 'http://www\\.chemspider\\.com/.*'
|
||||
|
||||
search = 'Search.asmx/SimpleSearch?query=%s&token='
|
||||
structure = 'Chemical-Structure.%s.html'
|
||||
@ -293,6 +293,6 @@ class ChemSpider(Source):
|
||||
"""
|
||||
if compound in self.ignore_list or self.cfg['token'] == '':
|
||||
return None
|
||||
searchurl = self.website[:-2] + self.search % compound
|
||||
searchurl = self.website[:-2].replace("\\", "") + self.search % compound
|
||||
log.msg('chemspider compound', level=log.DEBUG)
|
||||
return Request(url=searchurl, callback=self.parse_searchrequest)
|
||||
|
@ -18,7 +18,7 @@ class NIST(Source):
|
||||
This plugin manages searching for a chemical on the NIST website
|
||||
and parsing the resulting page if the chemical exists on NIST.
|
||||
"""
|
||||
website = "http://webbook\.nist\.gov/.*"
|
||||
website = "http://webbook\\.nist\\.gov/.*"
|
||||
|
||||
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
|
||||
|
||||
@ -329,5 +329,5 @@ class NIST(Source):
|
||||
"""
|
||||
if compound not in self.ignore_list:
|
||||
self.ignore_list.update(compound)
|
||||
return Request(url=self.website[:-2] + self.search % compound,
|
||||
return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
|
||||
callback=self.parse)
|
||||
|
@ -16,8 +16,8 @@ class PubChem(Source):
|
||||
"""
|
||||
|
||||
# PubChem keeps the compound name, the properties and their values on different HTML pages, so different URLs are used
|
||||
website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*'
|
||||
website_www = 'https://www.ncbi.nlm.nih.gov/.*'
|
||||
website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
|
||||
website_www = 'https://www.ncbi.nlm.nih.gov/*'
|
||||
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
|
||||
search = 'pccompound?term=%s'
|
||||
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
|
||||
|
@ -15,7 +15,7 @@ class WikipediaParser(Source):
|
||||
It also returns requests with other external sources which contain information on parsed subject.
|
||||
"""
|
||||
|
||||
website = "http://en\.wikipedia\.org/wiki/.*"
|
||||
website = "http://en\\.wikipedia\\.org/wiki/.*"
|
||||
__spider = None
|
||||
searched_compounds = []
|
||||
|
||||
@ -123,7 +123,7 @@ class WikipediaParser(Source):
|
||||
return items
|
||||
|
||||
def new_compound_request(self, compound):
|
||||
return Request(url=self.website[:-2] + compound, callback=self.parse)
|
||||
return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
|
||||
|
||||
@staticmethod
|
||||
def clean_items(items):
|
||||
|
@ -30,7 +30,7 @@ class Source:
|
||||
:param compound: A compound name.
|
||||
:return: A new Scrapy Request
|
||||
"""
|
||||
# return Request(url=self.website[:-2] + compound, callback=self.parse)
|
||||
# return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
|
||||
pass
|
||||
|
||||
def set_spider(self, spider):
|
||||
|
Reference in New Issue
Block a user