From 4b377bb9a966e4b1fd82101e865d70fae0c30b1c Mon Sep 17 00:00:00 2001
From: Nout van Deijck
Date: Wed, 21 May 2014 15:25:55 +0200
Subject: [PATCH] PubChem now scrapes its synonyms

---
 FourmiCrawler/sources/PubChem.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index d34a2cb..0ce727f 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -19,12 +19,10 @@ class PubChem(Source):
 
     website = 'https://www.ncbi.nlm.nih.gov/*'
 
-
-
     search = 'pccompound?term=%s'
 
     __spider = None
-    searched_compounds = []
+    searched_compounds = set()
 
     def __init__(self):
         Source.__init__(self)
@@ -34,12 +32,21 @@ class PubChem(Source):
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         compound = sel.xpath('//h1/text()').extract()[0]
+        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
+        for synonym in raw_synonyms.strip().split(', '):
+            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
+            self.searched_compounds.add(synonym)  # add(), not update(): update() would insert each character of the name
+            self._spider.get_synonym_requests(synonym)
+
+
+        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
+
         if compound in self.searched_compounds:
             return None
         else:
             # items = self.parse_properties(sel)
             items = []
-            self.searched_compounds.append(compound)
+            self.searched_compounds.add(compound)  # store the whole name so the membership test above can match it
             print items
             return items
 