diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 8c1df07..338f224 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -18,8 +18,10 @@ ITEM_PIPELINES = {
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
-
 
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
 
+# [todo] - Check for repercussions on spoofing the user agent
+
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
new file mode 100644
index 0000000..fc8250b
--- /dev/null
+++ b/FourmiCrawler/sources/PubChem.py
@@ -0,0 +1,124 @@
+from scrapy.http import Request
+from scrapy import log
+from source import Source
+from scrapy.selector import Selector
+from FourmiCrawler.items import Result
+import re
+
+
+def _new_result(prop_name, node):
+    """
+    Build and log the Result for one property value.
+    :param prop_name: The name of the property
+    :param node: The selector whose <a> children hold the value (text) and the source (title attribute)
+    :return: The new Result
+    """
+    new_prop = Result({
+        'attribute': prop_name,
+        'value': ''.join(node.xpath('a/text()').extract()),
+        'source': ''.join(node.xpath('a/@title').extract()),
+        'reliability': 'Unknown',
+        'conditions': ''
+    })
+    log.msg('PubChem prop: |%s| |%s| |%s|' %
+            (new_prop['attribute'], new_prop['value'],
+             new_prop['source']), level=log.DEBUG)
+    return new_prop
+
+
+class PubChem(Source):
+    """ PubChem scraper for chemical properties
+
+    This parser parses the part on PubChem pages that gives Chemical and Physical properties of a substance,
+    including sources of the values of properties.
+    """
+
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    website = 'https://*.ncbi.nlm.nih.gov/*'
+    website_www = 'https://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    search = 'pccompound?term=%s'
+    data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
+
+    # NOTE(review): this name is mangled to _PubChem__spider, so it is not the `_spider` attribute read in parse()
+    __spider = None
+    # class attribute, so this set is shared by all PubChem instances
+    searched_compounds = set()
+
+    def __init__(self, config):
+        Source.__init__(self, config)
+        self.cfg = config
+
+    def parse(self, response):
+        """
+        Parse a compound page: register its name and synonyms and request the page with its properties.
+        :param response: The response of the compound search request
+        :return: A list with the request for the properties page (empty if the url holds no cid), or None if the
+                 compound name is missing or already known
+        """
+        requests = []
+        log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
+
+        sel = Selector(response)
+        names = sel.xpath('//h1/text()').extract()
+        if not names:
+            # without this guard a page without <h1> text raises an IndexError on names[0]
+            log.msg('No compound name found on %s' % response.url, level=log.WARNING)
+            return None
+        compound = names[0]
+        if compound in self.searched_compounds:
+            return None
+
+        # add() stores the whole name; update() would add every single character of the string instead
+        self.searched_compounds.add(compound)
+        synonym_texts = sel.xpath('//div[@class="smalltext"]/text()').extract()
+        # a page without synonym text simply yields no synonyms
+        raw_synonyms = synonym_texts[0] if synonym_texts else ''
+        stripped_synonyms = raw_synonyms.strip()
+        for synonym in (stripped_synonyms.split(', ') if stripped_synonyms else []):
+            log.msg('PubChem synonym found: %s' % synonym, level=log.DEBUG)
+            self.searched_compounds.add(synonym)
+            # NOTE(review): the return value is discarded - confirm whether it should be added to `requests`
+            self._spider.get_synonym_requests(synonym)
+        log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
+
+        n = re.search(r'cid=(\d+)', response.url)
+        if n:
+            # the cid is the id of the compound with which the separate html page can be reached that contains
+            # the properties and their values
+            cid = n.group(1)
+            log.msg('cid: %s' % cid, level=log.DEBUG)
+
+            # using this cid to get the right url and scrape it
+            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+        return requests
+
+    def parse_data(self, response):
+        """
+        Parse data found in 'Chemical and Physical properties' part of a substance page.
+        :param response: The response with the page to parse
+        :return: requests: Returns a list of properties with their values, source, etc.
+        """
+        log.msg('parsing data', level=log.DEBUG)
+        requests = []
+
+        sel = Selector(response)
+        props = sel.xpath('//div')
+
+        for prop in props:
+            prop_name = ''.join(prop.xpath('b/text()').extract())  # name of property that it is parsing
+            if prop.xpath('a'):  # parsing for single value in property
+                requests.append(_new_result(prop_name, prop))
+            elif prop.xpath('ul'):  # parsing for multiple values (list) in property
+                for prop_li in prop.xpath('ul//li'):
+                    requests.append(_new_result(prop_name, prop_li))
+
+        return requests
+
+    def new_compound_request(self, compound):
+        """
+        Create the request that searches PubChem for a compound.
+        :param compound: The name of the compound to search for
+        :return: The search Request, with parse() as callback
+        """
+        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)