Merge pull request #3 from Recondor/feature/chemspider-parser-fixes
Feature/chemspider-parser-fixes
This commit is contained in:
commit
afaa0d903f
@ -47,7 +47,6 @@ class ChemSpider(Source):
|
|||||||
properties = []
|
properties = []
|
||||||
|
|
||||||
# Predicted - ACD/Labs tab
|
# Predicted - ACD/Labs tab
|
||||||
# [TODO] - test if tab contains data, some chemicals do not have data here
|
|
||||||
td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
|
td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
|
||||||
'normalize-space(string())')
|
'normalize-space(string())')
|
||||||
prop_names = td_list[::2]
|
prop_names = td_list[::2]
|
||||||
@ -58,6 +57,12 @@ class ChemSpider(Source):
|
|||||||
prop_value = prop_value.extract().encode('utf-8')
|
prop_value = prop_value.extract().encode('utf-8')
|
||||||
prop_conditions = ''
|
prop_conditions = ''
|
||||||
|
|
||||||
|
# Test for properties without values, with one hardcoded exception
|
||||||
|
if (not re.match(r'^\d', prop_value) or
|
||||||
|
(prop_name == 'Polarizability' and
|
||||||
|
prop_value == '10-24cm3')):
|
||||||
|
continue
|
||||||
|
|
||||||
# Match for condition in parentheses
|
# Match for condition in parentheses
|
||||||
m = re.match(r'(.*) \((.*)\)', prop_name)
|
m = re.match(r'(.*) \((.*)\)', prop_name)
|
||||||
if m:
|
if m:
|
||||||
@ -192,7 +197,8 @@ class ChemSpider(Source):
|
|||||||
'reliability': 'Unknown',
|
'reliability': 'Unknown',
|
||||||
'conditions': ''
|
'conditions': ''
|
||||||
})
|
})
|
||||||
properties.append(result)
|
if result['value']:
|
||||||
|
properties.append(result)
|
||||||
return properties
|
return properties
|
||||||
|
|
||||||
def parse_searchrequest(self, response):
|
def parse_searchrequest(self, response):
|
||||||
@ -200,8 +206,14 @@ class ChemSpider(Source):
|
|||||||
sel = Selector(response)
|
sel = Selector(response)
|
||||||
log.msg('chemspider parse_searchrequest', level=log.DEBUG)
|
log.msg('chemspider parse_searchrequest', level=log.DEBUG)
|
||||||
sel.register_namespace('cs', 'http://www.chemspider.com/')
|
sel.register_namespace('cs', 'http://www.chemspider.com/')
|
||||||
csid = sel.xpath('.//cs:int/text()').extract()[0]
|
csids = sel.xpath('.//cs:int/text()').extract()
|
||||||
# [TODO] - handle multiple csids in case of vague search term
|
if len(csids) == 0:
|
||||||
|
log.msg('ChemSpider found nothing', level=log.ERROR)
|
||||||
|
return
|
||||||
|
elif len(csids) > 1:
|
||||||
|
log.msg('ChemSpider found multiple substances, taking first '
|
||||||
|
'element', level=log.DEBUG)
|
||||||
|
csid = csids[0]
|
||||||
structure_url = self.website[:-1] + self.structure % csid
|
structure_url = self.website[:-1] + self.structure % csid
|
||||||
extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
|
extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
|
||||||
log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
|
log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
|
||||||
@ -215,4 +227,4 @@ class ChemSpider(Source):
|
|||||||
return None
|
return None
|
||||||
searchurl = self.website[:-1] + self.search % compound
|
searchurl = self.website[:-1] + self.search % compound
|
||||||
log.msg('chemspider compound', level=log.DEBUG)
|
log.msg('chemspider compound', level=log.DEBUG)
|
||||||
return Request(url=searchurl, callback=self.parse_searchrequest)
|
return Request(url=searchurl, callback=self.parse_searchrequest)
|
||||||
|
Reference in New Issue
Block a user