diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index a62f6dd..332c036 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -47,7 +47,6 @@ class ChemSpider(Source): properties = [] # Predicted - ACD/Labs tab - # [TODO] - test if tab contains data, some chemicals do not have data here td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath( 'normalize-space(string())') prop_names = td_list[::2] @@ -58,6 +57,12 @@ class ChemSpider(Source): prop_value = prop_value.extract().encode('utf-8') prop_conditions = '' + # Test for properties without values, with one hardcoded exception + if (not re.match(r'^\d', prop_value) or + (prop_name == 'Polarizability' and + prop_value == '10-24cm3')): + continue + # Match for condition in parentheses m = re.match(r'(.*) \((.*)\)', prop_name) if m: @@ -215,4 +220,4 @@ class ChemSpider(Source): return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.DEBUG) - return Request(url=searchurl, callback=self.parse_searchrequest) \ No newline at end of file + return Request(url=searchurl, callback=self.parse_searchrequest)