moved scraping of experimental data tab to its own function
This commit is contained in:
parent
41bbe5b1de
commit
b4a724392b
@ -67,28 +67,7 @@ class ChemSpider(Source):
|
|||||||
properties = []
|
properties = []
|
||||||
|
|
||||||
properties.extend(self.parse_acdlabstab(sel))
|
properties.extend(self.parse_acdlabstab(sel))
|
||||||
|
properties.extend(self.parse_experimentaldatatab(sel))
|
||||||
# Experimental Data Tab, Physico-chemical properties in particular
|
|
||||||
scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
|
|
||||||
'Properties"]//li/table/tr/td')
|
|
||||||
if not scraped_list:
|
|
||||||
return properties
|
|
||||||
# Format is: property name followed by a list of values
|
|
||||||
property_name = scraped_list.pop(0).xpath(
|
|
||||||
'span/text()').extract()[0].rstrip()
|
|
||||||
for line in scraped_list:
|
|
||||||
if line.xpath('span/text()'):
|
|
||||||
property_name = line.xpath('span/text()').extract()[0].rstrip()
|
|
||||||
else:
|
|
||||||
new_prop = self.newresult(
|
|
||||||
attribute=property_name[:-1],
|
|
||||||
value=line.xpath('text()').extract()[0].rstrip(),
|
|
||||||
source=line.xpath('strong/text()').extract()[0].rstrip(),
|
|
||||||
)
|
|
||||||
properties.append(new_prop)
|
|
||||||
log.msg('CS prop: |%s| |%s| |%s|' %
|
|
||||||
(new_prop['attribute'], new_prop['value'],
|
|
||||||
new_prop['source']), level=log.DEBUG)
|
|
||||||
|
|
||||||
return properties
|
return properties
|
||||||
|
|
||||||
@ -115,13 +94,11 @@ class ChemSpider(Source):
|
|||||||
(prop_name == 'Polarizability' and prop_value == '10-24cm3')):
|
(prop_name == 'Polarizability' and prop_value == '10-24cm3')):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Match for condition in parentheses
|
|
||||||
m = re.match(r'(.*) \((.*)\)', prop_name)
|
m = re.match(r'(.*) \((.*)\)', prop_name)
|
||||||
if m:
|
if m:
|
||||||
prop_name = m.group(1)
|
prop_name = m.group(1)
|
||||||
prop_conditions = m.group(2)
|
prop_conditions = m.group(2)
|
||||||
|
|
||||||
# Match for condition in value seperated by an 'at'
|
|
||||||
m = re.match(r'(.*) at (.*)', prop_value)
|
m = re.match(r'(.*) at (.*)', prop_value)
|
||||||
if m:
|
if m:
|
||||||
prop_value = m.group(1)
|
prop_value = m.group(1)
|
||||||
@ -134,10 +111,35 @@ class ChemSpider(Source):
|
|||||||
conditions=prop_conditions
|
conditions=prop_conditions
|
||||||
)
|
)
|
||||||
properties.append(new_prop)
|
properties.append(new_prop)
|
||||||
log.msg('CS prop: |%s| |%s| |%s|' %
|
|
||||||
(new_prop['attribute'], new_prop['value'],
|
return properties
|
||||||
new_prop['source']),
|
|
||||||
level=log.DEBUG)
|
def parse_experimentaldatatab(self, sel):
|
||||||
|
"""
|
||||||
|
This function scrapes Experimental Data tab, Physico-chemical
|
||||||
|
properties in particular.
|
||||||
|
:param sel: a Selector object of the whole page
|
||||||
|
:return: a list of Result items
|
||||||
|
"""
|
||||||
|
properties = []
|
||||||
|
|
||||||
|
scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
|
||||||
|
'Properties"]//li/table/tr/td')
|
||||||
|
if not scraped_list:
|
||||||
|
return properties
|
||||||
|
# Format is: property name followed by a list of values
|
||||||
|
property_name = scraped_list.pop(0).xpath(
|
||||||
|
'span/text()').extract()[0].rstrip()
|
||||||
|
for line in scraped_list:
|
||||||
|
if line.xpath('span/text()'):
|
||||||
|
property_name = line.xpath('span/text()').extract()[0].rstrip()
|
||||||
|
else:
|
||||||
|
new_prop = self.newresult(
|
||||||
|
attribute=property_name[:-1],
|
||||||
|
value=line.xpath('text()').extract()[0].rstrip(),
|
||||||
|
source=line.xpath('strong/text()').extract()[0].rstrip(),
|
||||||
|
)
|
||||||
|
properties.append(new_prop)
|
||||||
|
|
||||||
return properties
|
return properties
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user