From 472aae86be443c8372c356b0509e13cabd3b1c9d Mon Sep 17 00:00:00 2001 From: RTB Date: Sat, 17 May 2014 19:32:20 +0200 Subject: [PATCH] synonyms are now scraped --- FourmiCrawler/sources/NIST.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index a969384..4bb8e30 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -79,6 +79,12 @@ class NIST(Source): ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') li = ul.xpath('li') + raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() + for synonym in raw_synonyms[0].strip().split(';\n'): + log.msg('NIST synonym: %s' % synonym, level=log.DEBUG) + self.ignore_list.update(synonym) + self._spider.get_synonym_requests(synonym) + data = {} raw_formula = ul.xpath('li[strong/a="Formula"]//text()').extract()