PubChem now scrapes its synonyms

2014-05-21 15:25:55 +02:00 · 2014-05-21 15:25:55 +02:00 · 4b377bb9a9
commit 4b377bb9a9
parent 84f2e3dbea
1 changed file with 11 additions and 4 deletions
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -19,12 +19,10 @@ class PubChem(Source):


    website = 'https://www.ncbi.nlm.nih.gov/*'
-
-
    search = 'pccompound?term=%s'

    __spider = None
-    searched_compounds = []
+    searched_compounds = set()

    def __init__(self):
        Source.__init__(self)
@@ -34,12 +32,21 @@ class PubChem(Source):
        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
        sel = Selector(response)
        compound = sel.xpath('//h1/text()').extract()[0]
+        raw_synonyms = sel.xpath('//div[@class="smalltext"]/text()').extract()[0]
+        for synonym in raw_synonyms.strip().split(', '):
+            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
+            self.searched_compounds.update(synonym)
+            self._spider.get_synonym_requests(synonym)
+
+
+        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
+
        if compound in self.searched_compounds:
            return None
        else:
            # items = self.parse_properties(sel)
            items = []
-            self.searched_compounds.append(compound)
+            self.searched_compounds.update(compound)
            print items
            return items