
Merge branch 'feature/chemspider-parser' of code.giphouse.nl:giphouse/descartes-2 into feature/chemspider-parser

Jip J. Dekker 2014-04-16 17:01:09 +02:00
commit bd7fb38497


@@ -15,6 +15,7 @@ class ChemSpider(Parser):
     search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338"
     structure = "Chemical-Structure.%s.html"
+    extendedinfo = "MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338"
     ignore_list = []
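
The new extendedinfo template targets ChemSpider's MassSpecAPI GetExtendedCompoundInfo operation, which returns an XML record of per-compound fields for a given CSID. A minimal sketch of how the template expands into a request URL; the wildcard-terminated base URL and the example CSID are assumptions for illustration, not part of this commit:

    # Sketch only: assumes the parser's website attribute ends in a wildcard
    # character that the [:-1] slice strips off, as parse_searchrequest does
    # when building the structure URL.
    website = 'http://www.chemspider.com/*'
    extendedinfo = ("MassSpecAPI.asmx/GetExtendedCompoundInfo?"
                    "csid=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338")

    csid = '2157'  # hypothetical ChemSpider ID, for illustration only
    url = website[:-1] + extendedinfo % csid
    # -> http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=2157&token=...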
@@ -25,7 +26,7 @@ class ChemSpider(Parser):
         requests.extend(requests_synonyms)
         requests_properties = self.parse_properties(sel)
         requests.extend(requests_properties)
-        for wiki_url in sel.xpath('.//a[@title="Wiki"]/@href').extract():
+        for wiki_url in sel.xpath('.//p[@class="syn"][strong]/a[@title="Wiki"]/@href').extract():
             requests.append( Request(url=wiki_url) )
         return requests
@@ -38,16 +39,17 @@ class ChemSpider(Parser):
         prop_names = td_list[::2]
         prop_values = td_list[1::2]
         for i, prop_name in enumerate(prop_names):
-            new_prop = Result()
-            new_prop['attribute'] = prop_name.extract().encode('utf-8')
-            new_prop['value'] = prop_values[i].extract().encode('utf-8')
-            new_prop['source'] = 'ChemSpider Predicted - ACD/Labs Tab'
-            new_prop['reliability'] = None
-            new_prop['conditions'] = None
+            new_prop = Result({
+                'attribute': prop_name.extract().encode('utf-8'),
+                'value': prop_values[i].extract().encode('utf-8'),
+                'source': 'ChemSpider Predicted - ACD/Labs Tab',
+                'reliability': '',
+                'conditions': ''
+            })
             properties.append(new_prop)
             log.msg('CS prop: |%s| |%s| |%s|' \
                     % (new_prop['attribute'],new_prop['value'], new_prop['source']),
-                    level=log.WARNING)
+                    level=log.DEBUG)

         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical Properties"]//li/table/tr/td')
         if not scraped_list:
@@ -57,16 +59,17 @@ class ChemSpider(Parser):
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
             else:
-                new_prop = Result()
-                new_prop['attribute'] = property_name
-                new_prop['value'] = line.xpath('text()').extract()[0].rstrip()
-                new_prop['source'] = line.xpath('strong/text()').extract()[0].rstrip()
-                new_prop['reliability'] = None
-                new_prop['conditions'] = None
+                new_prop = Result({
+                    'attribute': property_name,
+                    'value': line.xpath('text()').extract()[0].rstrip(),
+                    'source': line.xpath('strong/text()').extract()[0].rstrip(),
+                    'reliability': '',
+                    'conditions': ''
+                })
                 properties.append(new_prop)
                 log.msg('CS prop: |%s| |%s| |%s|' \
                         % (new_prop['attribute'],new_prop['value'], new_prop['source']),
-                        level=log.WARNING)
+                        level=log.DEBUG)

         return properties
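
Both property loops above now build each Result in one expression by handing a dict of initial field values to the item constructor, instead of assigning the five fields one by one. A self-contained sketch of the pattern, assuming Result is a Scrapy Item with these fields (its real definition lives elsewhere in the project):

    from scrapy.item import Item, Field

    class Result(Item):        # assumed shape of the project's Result item
        attribute = Field()
        value = Field()
        source = Field()
        reliability = Field()
        conditions = Field()

    # Scrapy items accept a dict of initial values, so a whole record can be
    # created in a single expression.
    new_prop = Result({
        'attribute': 'Boiling Point',       # illustrative values
        'value': '100 deg C',
        'source': 'ChemSpider Predicted - ACD/Labs Tab',
        'reliability': '',
        'conditions': ''
    })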
@@ -87,30 +90,47 @@ class ChemSpider(Parser):
         return requests

     def new_synonym(self, name, reliability):
-        log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.WARNING)
+        log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.DEBUG)
         self.ignore_list.append(name)
         synonym = Result()
         synonym['attribute'] = 'synonym'
         synonym['value'] = name
         synonym['source'] = 'ChemSpider'
         synonym['reliability'] = reliability
-        synonym['conditions'] = None
+        synonym['conditions'] = ''
         return synonym

+    def parse_extendedinfo(self, response):
+        sel = Selector(response)
+        properties = []
+        names = sel.xpath('*').xpath('name()').extract()
+        values = sel.xpath('*').xpath('text()').extract()
+        for (name, value) in zip(names, values):
+            result = Result({
+                'attribute': name,
+                'value': value,
+                'source': 'ChemSpider',
+                'reliability': '',
+                'conditions': ''
+            })
+            properties.append(result)
+        return properties
+
     def parse_searchrequest(self, response):
         sel = Selector(response)
-        log.msg('chemspider parse_searchrequest', level=log.WARNING)
+        log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
         csid = sel.xpath('.//cs:int/text()').extract()[0]
         #TODO: handle multiple csids in case of vague search term
         structure_url = self.website[:-1] + self.structure % csid
-        log.msg('chemspider URL: %s' % structure_url, level=log.WARNING)
-        return Request(structure_url, callback=self.parse)
+        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
+        return [Request(url=structure_url, callback=self.parse),
+                Request(url=extendedinfo_url, callback=self.parse_extendedinfo)]

     def new_compound_request(self,compound):
         if compound in self.ignore_list: #TODO: add regular expression
             return None
         searchurl = self.website[:-1] + self.search % compound
-        log.msg('chemspider compound', level=log.WARNING)
+        log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
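
The new parse_extendedinfo callback pairs each child element of the XML reply with its text and emits one Result per field. A self-contained sketch of that name/text zip on a made-up response body; the element names are illustrative and may not match the real GetExtendedCompoundInfo output:

    from scrapy.selector import Selector

    xml = ('<ExtendedCompoundInfo>'
           '<MF>C9H8O4</MF>'
           '<AverageMass>180.157</AverageMass>'
           '<CommonName>Aspirin</CommonName>'
           '</ExtendedCompoundInfo>')       # made-up excerpt, not real API output
    sel = Selector(text=xml, type='xml')

    # Element names and their text contents line up index for index.
    names = sel.xpath('*').xpath('name()').extract()
    values = sel.xpath('*').xpath('text()').extract()
    print(list(zip(names, values)))
    # [('MF', 'C9H8O4'), ('AverageMass', '180.157'), ('CommonName', 'Aspirin')]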