From 564dbc32929d8e45b22f85e219adc22cf2cb1bd5 Mon Sep 17 00:00:00 2001 From: RTB Date: Mon, 14 Apr 2014 00:33:25 +0200 Subject: [PATCH] added ignore list to new_compound_request for synonyms found by chemspider parser --- FourmiCrawler/parsers/ChemSpider.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index 01a7c95..fb0b9fa 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -16,6 +16,8 @@ class ChemSpider(Parser): search = "Search.asmx/SimpleSearch?query=%s&token=052bfd06-5ce4-43d6-bf12-89eabefd2338" structure = "Chemical-Structure.%s.html" + ignore_list = [] + def parse(self, response): sel = Selector(response) requests = [] @@ -36,6 +38,8 @@ class ChemSpider(Parser): for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): synonyms.append( self.new_synonym( syn, 'low' ) ) + self.ignore_list.extend(synonyms) + return requests def new_synonym(self, name, reliability): @@ -60,6 +64,8 @@ class ChemSpider(Parser): return Request(structure_url, callback=self.parse) def new_compound_request(self,compound): + if compound in self.ignore_list: #TODO: add regular expression + return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.WARNING) return Request(url=searchurl, callback=self.parse_searchrequest)