From 859a18c61a260fc1546df78c05e07a24468c91d1 Mon Sep 17 00:00:00 2001 From: RTB Date: Sat, 12 Apr 2014 22:27:28 +0200 Subject: [PATCH] added parsing of synonyms --- FourmiCrawler/parsers/ChemSpider.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/FourmiCrawler/parsers/ChemSpider.py b/FourmiCrawler/parsers/ChemSpider.py index ab62578..adfad2f 100644 --- a/FourmiCrawler/parsers/ChemSpider.py +++ b/FourmiCrawler/parsers/ChemSpider.py @@ -19,7 +19,26 @@ class ChemSpider(Parser): def parse(self, response): sel = Selector(response) - log.msg('chemspider parse', level=log.WARNING) + synonyms = [] + for syn in sel.xpath('//p[@class="syn"]/strong/text()').extract(): + synonyms.append( self.new_synonym( syn, 'high' ) ) + for syn in sel.xpath('//p[@class="syn"]/span[@class="synonym_confirmed"]/text()').extract(): + synonyms.append( self.new_synonym( syn, 'medium' ) ) + for syn in sel.xpath('//p[@class="syn"]/span[@class=""]/text()').extract(): + synonyms.append( self.new_synonym( syn, 'low' ) ) + + return synonyms + + def new_synonym(self, name, reliability): + log.msg('CS synonym: %s (%s)' % (name, reliability), level=log.WARNING) + synonym = Result() + synonym['attribute'] = 'synonym' + synonym['value'] = name + synonym['source'] = self.__spider + synonym['reliability'] = reliability + synonym['conditions'] = None + return synonym + def parse_searchrequest(self, response): sel = Selector(response)