From 2e95d35283b33bb8134e1a5c2e4fd699f37854fa Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 17 Apr 2014 21:30:53 +0200 Subject: [PATCH] modified parse_synonyms and new_synonym to include a Selector for future edits --- FourmiCrawler/parsers/ChemSpider.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index ae91e8b..8dc0103 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -76,12 +76,17 @@ class ChemSpider(Parser): def parse_synonyms(self, sel): requests = [] synonyms = [] - for syn in sel.xpath('//p[@class="syn"]/strong/text()').extract(): - synonyms.append( self.new_synonym( syn, 'high' ) ) - for syn in sel.xpath('//p[@class="syn"]/span[@class="synonym_confirmed"]/text()').extract(): - synonyms.append( self.new_synonym( syn, 'medium' ) ) - for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): - synonyms.append( self.new_synonym( syn, 'low' ) ) + for syn in sel.xpath('//p[@class="syn"][strong]'): + name = syn.xpath('strong/text()').extract()[0] + synonyms.append(self.new_synonym(syn, name, 'high')) + for syn in sel.xpath( + '//p[@class="syn"][span[@class="synonym_confirmed"]]'): + name = syn.xpath( + 'span[@class="synonym_confirmed"]/text()').extract()[0] + synonyms.append(self.new_synonym(syn, name, 'medium')) + for syn in sel.xpath('//p[@class="syn"][span[@class=""]]'): + name = syn.xpath('span[@class=""]/text()').extract()[0] + synonyms.append(self.new_synonym(syn, name, 'low')) for synonym in synonyms: if synonym['reliability'] == 'high': @@ -89,7 +94,7 @@ class ChemSpider(Parser): return requests - def new_synonym(self, name, reliability): + def new_synonym(self, sel, name, reliability): log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.DEBUG) self.ignore_list.append(name) synonym = Result()