From b4a724392bb336885a719a4a2516d1daddf88eab Mon Sep 17 00:00:00 2001 From: RTB Date: Tue, 10 Jun 2014 14:04:39 +0200 Subject: [PATCH] moved scraping of experimental data tab to its own function --- FourmiCrawler/sources/ChemSpider.py | 58 +++++++++++++++-------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index ac6d5eb..95a1dee 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -67,28 +67,7 @@ class ChemSpider(Source): properties = [] properties.extend(self.parse_acdlabstab(sel)) - - # Experimental Data Tab, Physico-chemical properties in particular - scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' - 'Properties"]//li/table/tr/td') - if not scraped_list: - return properties - # Format is: property name followed by a list of values - property_name = scraped_list.pop(0).xpath( - 'span/text()').extract()[0].rstrip() - for line in scraped_list: - if line.xpath('span/text()'): - property_name = line.xpath('span/text()').extract()[0].rstrip() - else: - new_prop = self.newresult( - attribute=property_name[:-1], - value=line.xpath('text()').extract()[0].rstrip(), - source=line.xpath('strong/text()').extract()[0].rstrip(), - ) - properties.append(new_prop) - log.msg('CS prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], - new_prop['source']), level=log.DEBUG) + properties.extend(self.parse_experimentaldatatab(sel)) return properties @@ -115,13 +94,11 @@ class ChemSpider(Source): (prop_name == 'Polarizability' and prop_value == '10-24cm3')): continue - # Match for condition in parentheses m = re.match(r'(.*) \((.*)\)', prop_name) if m: prop_name = m.group(1) prop_conditions = m.group(2) - # Match for condition in value seperated by an 'at' m = re.match(r'(.*) at (.*)', prop_value) if m: prop_value = m.group(1) @@ -134,10 +111,35 @@ class ChemSpider(Source): conditions=prop_conditions ) properties.append(new_prop) - log.msg('CS prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], - new_prop['source']), - level=log.DEBUG) + + return properties + + def parse_experimentaldatatab(self, sel): + """ + This function scrapes Experimental Data tab, Physico-chemical + properties in particular. + :param sel: a Selector object of the whole page + :return: a list of Result items + """ + properties = [] + + scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical ' + 'Properties"]//li/table/tr/td') + if not scraped_list: + return properties + # Format is: property name followed by a list of values + property_name = scraped_list.pop(0).xpath( + 'span/text()').extract()[0].rstrip() + for line in scraped_list: + if line.xpath('span/text()'): + property_name = line.xpath('span/text()').extract()[0].rstrip() + else: + new_prop = self.newresult( + attribute=property_name[:-1], + value=line.xpath('text()').extract()[0].rstrip(), + source=line.xpath('strong/text()').extract()[0].rstrip(), + ) + properties.append(new_prop) return properties