Merge branch 'feature/PubChem' into develop

2014-06-17 00:34:04 +02:00 · 2014-06-17 00:34:04 +02:00 · 30f4d09cf9
commit 30f4d09cf9
parent b5071335a1 6e16e9f23e
2 changed files with 114 additions and 1 deletions
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@ -18,8 +18,10 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'

-
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent

+# [todo] - Check the repercussions of spoofing the user agent
+
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@ -0,0 +1,111 @@
+from scrapy.http import Request
+from scrapy import log
+from source import Source
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+import re
+
+
+class PubChem(Source):
+    """ PubChem scraper for chemical properties
+
+        This parser parses the part of a PubChem page that gives the chemical and physical properties of a
+        substance, including the sources of the property values.
+    """
+
+    # PubChem keeps the compound name and the properties with their values on different HTML pages, so
+    # different URL patterns are used. The trailing '*' of a pattern is stripped before a concrete URL is built.
+    website = 'https://*.ncbi.nlm.nih.gov/*'
+    website_www = 'https://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    search = 'pccompound?term=%s'
+    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
+
+    # NOTE(review): name mangling turns this attribute into `_PubChem__spider`, while the methods below use
+    # `self._spider` (presumably provided by Source - confirm). Kept only for backward compatibility.
+    __spider = None
+    # Defined on the class, so the set is shared by every PubChem instance.
+    searched_compounds = set()
+
+    def __init__(self, config):
+        """
+        Initialise the source and keep a reference to its configuration.
+        :param config: The configuration, passed on unchanged to Source.__init__
+        """
+        Source.__init__(self, config)
+        self.cfg = config
+
+    def parse(self, response):
+        """
+        Handles a compound page: registers the compound and its synonyms and requests the page that holds
+        the property values of the compound.
+        :param response: The incoming search response
+        :return: A list with the request for the property page, or None if the compound is already known or
+                 the page does not contain the expected compound name or cid
+        """
+        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+
+        sel = Selector(response)
+        headings = sel.xpath('//h1/text()').extract()
+        if not headings:
+            # Without a heading there is no compound name to work with (previously an IndexError).
+            log.msg('PubChem: no compound name found on %s' % response.url, level=log.WARNING)
+            return None
+
+        compound = headings[0]
+        if compound in self.searched_compounds:
+            return None
+
+        # add(), not update(): update() iterates over the string and would store its separate characters.
+        self.searched_compounds.add(compound)
+
+        synonym_texts = sel.xpath('//div[@class="smalltext"]/text()').extract()
+        if synonym_texts:
+            raw_synonyms = synonym_texts[0]
+            for synonym in raw_synonyms.strip().split(', '):
+                log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
+                self.searched_compounds.add(synonym)
+                # NOTE(review): the return value is discarded here; if get_synonym_requests returns requests
+                # instead of scheduling them itself, they are lost - verify against the spider.
+                self._spider.get_synonym_requests(synonym)
+            log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
+
+        # The cid in the URL identifies the compound; it is needed to reach the separate HTML page which
+        # contains the properties and their values.
+        match = re.search(r'cid=(\d+)', response.url)
+        if not match:
+            # Previously `cid` stayed unbound in this case, which raised a NameError below.
+            log.msg('PubChem: no cid found in %s' % response.url, level=log.WARNING)
+            return None
+        cid = match.group(1)
+        log.msg('cid: %s' % cid, level=log.DEBUG)
+
+        # Use the cid to build the URL of the property page and have it scraped by parse_data.
+        return [Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)]
+
+    def parse_data(self, response):
+        """
+        Parse data found in 'Chemical and Physical properties' part of a substance page.
+        :param response: The response with the page to parse
+        :return: Returns a list of Result items, one for every property value found
+        """
+        log.msg('parsing data', level=log.DEBUG)
+        results = []
+
+        sel = Selector(response)
+        for prop in sel.xpath('//div'):
+            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of the property being parsed
+            if prop.xpath('a'):     # the property has a single value
+                results.append(self._make_result(prop_name, prop))
+            elif prop.xpath('ul'):    # the property has multiple values (a list)
+                for prop_li in prop.xpath('ul//li'):
+                    results.append(self._make_result(prop_name, prop_li))
+
+        return results
+
+    def _make_result(self, prop_name, node):
+        """
+        Build (and log) the Result for one property value.
+        :param prop_name: The name of the property
+        :param node: The selector whose direct 'a' children hold the value (text) and its source (title)
+        :return: The Result item for this value
+        """
+        new_prop = Result({
+            'attribute': prop_name,
+            'value': ''.join(node.xpath('a/text()').extract()),
+            'source': ''.join(node.xpath('a/@title').extract()),
+            'reliability': 'Unknown',
+            'conditions': ''
+        })
+        log.msg('PubChem prop: |%s| |%s| |%s|' %
+                (new_prop['attribute'], new_prop['value'],
+                 new_prop['source']), level=log.DEBUG)
+        return new_prop
+
+    def new_compound_request(self, compound):
+        """
+        Build the search request for a compound.
+        :param compound: The name of the compound to search for
+        :return: A request for the PubChem compound search, handled by parse
+        """
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)