diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 01a7c95..fb0b9fa 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -16,6 +16,8 @@ class ChemSpider(Parser): search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" structure = "Chemical-Structure.%s.html" + ignore_list = [] + def parse(self, response): sel = Selector(response) requests = [] @@ -36,6 +38,8 @@ class ChemSpider(Parser): for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): synonyms.append( self.new_synonym( syn, 'low' ) ) + self.ignore_list.extend(synonyms) + return requests def new_synonym(self, name, reliability): @@ -60,6 +64,8 @@ class ChemSpider(Parser): return Request(structure_url, callback=self.parse) def new_compound_request(self,compound): + if compound in self.ignore_list: #TODO: add regular expression + return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.WARNING) return Request(url=searchurl, callback=self.parse_searchrequest)