Escape the backslashes (escape characters) in the source website patterns
This commit is contained in:
parent
576683dcd0
commit
ef1c319396
@ -23,5 +23,5 @@ FEED_FORMAT = 'jsonlines'
|
|||||||
|
|
||||||
# [todo] - Check for repercussions on spoofing the user agent
|
# [todo] - Check for repercussions on spoofing the user agent
|
||||||
|
|
||||||
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
|
USER_AGENT = 'Fourmi'
|
||||||
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
|
# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
|
||||||
|
@ -19,7 +19,7 @@ class ChemSpider(Source):
|
|||||||
somewhere.
|
somewhere.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
website = 'http://www\.chemspider\.com/.*'
|
website = 'http://www\\.chemspider\\.com/.*'
|
||||||
|
|
||||||
search = 'Search.asmx/SimpleSearch?query=%s&token='
|
search = 'Search.asmx/SimpleSearch?query=%s&token='
|
||||||
structure = 'Chemical-Structure.%s.html'
|
structure = 'Chemical-Structure.%s.html'
|
||||||
@ -293,6 +293,6 @@ class ChemSpider(Source):
|
|||||||
"""
|
"""
|
||||||
if compound in self.ignore_list or self.cfg['token'] == '':
|
if compound in self.ignore_list or self.cfg['token'] == '':
|
||||||
return None
|
return None
|
||||||
searchurl = self.website[:-2] + self.search % compound
|
searchurl = self.website[:-2].replace("\\", "") + self.search % compound
|
||||||
log.msg('chemspider compound', level=log.DEBUG)
|
log.msg('chemspider compound', level=log.DEBUG)
|
||||||
return Request(url=searchurl, callback=self.parse_searchrequest)
|
return Request(url=searchurl, callback=self.parse_searchrequest)
|
||||||
|
@ -18,7 +18,7 @@ class NIST(Source):
|
|||||||
This plugin manages searching for a chemical on the NIST website
|
This plugin manages searching for a chemical on the NIST website
|
||||||
and parsing the resulting page if the chemical exists on NIST.
|
and parsing the resulting page if the chemical exists on NIST.
|
||||||
"""
|
"""
|
||||||
website = "http://webbook\.nist\.gov/.*"
|
website = "http://webbook\\.nist\\.gov/.*"
|
||||||
|
|
||||||
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
|
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
|
||||||
|
|
||||||
@ -329,5 +329,5 @@ class NIST(Source):
|
|||||||
"""
|
"""
|
||||||
if compound not in self.ignore_list:
|
if compound not in self.ignore_list:
|
||||||
self.ignore_list.update(compound)
|
self.ignore_list.update(compound)
|
||||||
return Request(url=self.website[:-2] + self.search % compound,
|
return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
|
||||||
callback=self.parse)
|
callback=self.parse)
|
||||||
|
@ -16,8 +16,8 @@ class PubChem(Source):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
# PubChem keeps compound names, properties and their values on different HTML pages, so different URLs are used
|
# PubChem keeps compound names, properties and their values on different HTML pages, so different URLs are used
|
||||||
website = 'https://.*\.ncbi\.nlm\.nih\.gov/.*'
|
website = 'https://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
|
||||||
website_www = 'https://www.ncbi.nlm.nih.gov/.*'
|
website_www = 'https://www.ncbi.nlm.nih.gov/*'
|
||||||
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
|
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/.*'
|
||||||
search = 'pccompound?term=%s'
|
search = 'pccompound?term=%s'
|
||||||
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
|
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
|
||||||
|
@ -15,7 +15,7 @@ class WikipediaParser(Source):
|
|||||||
It also returns requests with other external sources which contain information on parsed subject.
|
It also returns requests with other external sources which contain information on parsed subject.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
website = "http://en\.wikipedia\.org/wiki/.*"
|
website = "http://en\\.wikipedia\\.org/wiki/.*"
|
||||||
__spider = None
|
__spider = None
|
||||||
searched_compounds = []
|
searched_compounds = []
|
||||||
|
|
||||||
@ -123,7 +123,7 @@ class WikipediaParser(Source):
|
|||||||
return items
|
return items
|
||||||
|
|
||||||
def new_compound_request(self, compound):
|
def new_compound_request(self, compound):
|
||||||
return Request(url=self.website[:-2] + compound, callback=self.parse)
|
return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def clean_items(items):
|
def clean_items(items):
|
||||||
|
@ -30,7 +30,7 @@ class Source:
|
|||||||
:param compound: A compound name.
|
:param compound: A compound name.
|
||||||
:return: A new Scrapy Request
|
:return: A new Scrapy Request
|
||||||
"""
|
"""
|
||||||
# return Request(url=self.website[:-2] + compound, callback=self.parse)
|
# return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def set_spider(self, spider):
|
def set_spider(self, spider):
|
||||||
|
Reference in New Issue
Block a user