Archived
1
0

Merge branch 'feature/PubChem-fixes' into develop

This commit is contained in:
Jip J. Dekker 2014-06-19 22:51:07 +02:00
commit 335f558aca
8 changed files with 63 additions and 27 deletions

View File

@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
# Crawl responsibly by identifying yourself (and your website) on the # Crawl responsibly by identifying yourself (and your website) on the
# user-agent # user-agent
# [todo] - Check for repercussions on spoofing the user agent USER_AGENT = 'Fourmi'
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'

View File

@ -1,3 +1,5 @@
import re
from scrapy import log from scrapy import log
from scrapy.http import Request from scrapy.http import Request
from scrapy.selector import Selector from scrapy.selector import Selector
@ -5,7 +7,6 @@ from scrapy.selector import Selector
from source import Source from source import Source
from FourmiCrawler.items import Result from FourmiCrawler.items import Result
import re
# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
@ -18,7 +19,7 @@ class ChemSpider(Source):
somewhere. somewhere.
""" """
website = 'http://www.chemspider.com/*' website = 'http://www\\.chemspider\\.com/.*'
search = 'Search.asmx/SimpleSearch?query=%s&token=' search = 'Search.asmx/SimpleSearch?query=%s&token='
structure = 'Chemical-Structure.%s.html' structure = 'Chemical-Structure.%s.html'
@ -276,8 +277,8 @@ class ChemSpider(Source):
log.msg('ChemSpider found multiple substances, taking first ' log.msg('ChemSpider found multiple substances, taking first '
'element', level=log.DEBUG) 'element', level=log.DEBUG)
csid = csids[0] csid = csids[0]
structure_url = self.website[:-1] + self.structure % csid structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
extendedinfo_url = self.website[:-1] + self.extendedinfo % csid extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG) log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
return [Request(url=structure_url, return [Request(url=structure_url,
callback=self.parse), callback=self.parse),
@ -292,6 +293,6 @@ class ChemSpider(Source):
""" """
if compound in self.ignore_list or self.cfg['token'] == '': if compound in self.ignore_list or self.cfg['token'] == '':
return None return None
searchurl = self.website[:-1] + self.search % compound searchurl = self.website[:-2].replace("\\", "") + self.search % compound
log.msg('chemspider compound', level=log.DEBUG) log.msg('chemspider compound', level=log.DEBUG)
return Request(url=searchurl, callback=self.parse_searchrequest) return Request(url=searchurl, callback=self.parse_searchrequest)

View File

@ -18,7 +18,7 @@ class NIST(Source):
This plugin manages searching for a chemical on the NIST website This plugin manages searching for a chemical on the NIST website
and parsing the resulting page if the chemical exists on NIST. and parsing the resulting page if the chemical exists on NIST.
""" """
website = "http://webbook.nist.gov/*" website = "http://webbook\\.nist\\.gov/.*"
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@ -164,7 +164,7 @@ class NIST(Source):
extra_data_url = tr.xpath('td[last()][a="Individual data points"]' extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
'/a/@href').extract() '/a/@href').extract()
if extra_data_url: if extra_data_url:
request = Request(url=self.website[:-1] + extra_data_url[0], request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
callback=self.parse_individual_datapoints) callback=self.parse_individual_datapoints)
results.append(request) results.append(request)
continue continue
@ -329,5 +329,5 @@ class NIST(Source):
""" """
if compound not in self.ignore_list: if compound not in self.ignore_list:
self.ignore_list.update(compound) self.ignore_list.update(compound)
return Request(url=self.website[:-1] + self.search % compound, return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
callback=self.parse) callback=self.parse)

View File

@ -1,9 +1,11 @@
import re
from scrapy.http import Request from scrapy.http import Request
from scrapy import log from scrapy import log
from source import Source
from scrapy.selector import Selector from scrapy.selector import Selector
from source import Source
from FourmiCrawler.items import Result from FourmiCrawler.items import Result
import re
class PubChem(Source): class PubChem(Source):
@ -14,9 +16,9 @@ class PubChem(Source):
""" """
#PubChem has its data on compound name, properties and their values on different html pages, so different URLs used #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
website = 'https://*.ncbi.nlm.nih.gov/*' website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
website_www = 'https://www.ncbi.nlm.nih.gov/*' website_www = 'http://www.ncbi.nlm.nih.gov/*'
website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*' website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
search = 'pccompound?term=%s' search = 'pccompound?term=%s'
data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s' data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
@ -49,14 +51,14 @@ class PubChem(Source):
self._spider.get_synonym_requests(synonym) self._spider.get_synonym_requests(synonym)
log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG) log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
n = re.search(r'cid=(\d+)',response.url) n = re.search(r'cid=(\d+)', response.url)
if n: if n:
cid = n.group(1) cid = n.group(1)
log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
# the separate html page which contains the properties and their values # the separate html page which contains the properties and their values
#using this cid to get the right url and scrape it #using this cid to get the right url and scrape it
requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data)) requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
return requests return requests
def parse_data(self, response): def parse_data(self, response):
@ -106,6 +108,41 @@ class PubChem(Source):
return requests return requests
def parse_searchrequest(self, response):
"""
This function parses the response to the new_compound_request Request
:param response: the Response object to be parsed
:return: A Request for the compound page or what self.parse returns in
case the search request forwarded to the compound page
"""
#check if pubchem forwarded straight to compound page
m = re.match(self.website_pubchem, response.url)
if m:
log.msg('PubChem search forwarded to compound page',
level=log.DEBUG)
return self.parse(response)
sel = Selector(response)
results = sel.xpath('//div[@class="rsltcont"]')
if results:
url = results[0].xpath('div/p/a[1]/@href')
else:
log.msg('PubChem search found nothing or xpath failed',
level=log.DEBUG)
return None
if url:
url = 'http:' + ''.join(url[0].extract())
log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
else:
log.msg('PubChem search found results, but no url in first result',
level=log.DEBUG)
return None
return Request(url=url, callback=self.parse)
def new_compound_request(self, compound): def new_compound_request(self, compound):
return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse) return Request(url=self.website_www[:-1] + self.search % compound,
callback=self.parse_searchrequest)

View File

@ -15,7 +15,7 @@ class WikipediaParser(Source):
It also returns requests with other external sources which contain information on parsed subject. It also returns requests with other external sources which contain information on parsed subject.
""" """
website = "http://en.wikipedia.org/wiki/*" website = "http://en\\.wikipedia\\.org/wiki/.*"
__spider = None __spider = None
searched_compounds = [] searched_compounds = []
@ -123,7 +123,7 @@ class WikipediaParser(Source):
return items return items
def new_compound_request(self, compound): def new_compound_request(self, compound):
return Request(url=self.website[:-1] + compound, callback=self.parse) return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
@staticmethod @staticmethod
def clean_items(items): def clean_items(items):

View File

@ -3,7 +3,7 @@ from scrapy import log
class Source: class Source:
website = "http://something/*" # Regex of URI's the source is able to parse website = "http://something/.*" # Regex of URI's the source is able to parse
_spider = None _spider = None
def __init__(self, config=None): def __init__(self, config=None):
@ -30,7 +30,7 @@ class Source:
:param compound: A compound name. :param compound: A compound name.
:return: A new Scrapy Request :return: A new Scrapy Request
""" """
# return Request(url=self.website[:-1] + compound, callback=self.parse) # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
pass pass
def set_spider(self, spider): def set_spider(self, spider):

View File

@ -34,8 +34,9 @@ class FourmiSpider(Spider):
""" """
for source in self._sources: for source in self._sources:
if re.match(source.website, response.url): if re.match(source.website, response.url):
log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG) log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
return source.parse(response) return source.parse(response)
log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
return None return None
def get_synonym_requests(self, compound, force=False): def get_synonym_requests(self, compound, force=False):

View File

@ -1,7 +1,7 @@
import ConfigParser import ConfigParser
import os
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
import os
class Configurator: class Configurator:
""" """
@ -67,7 +67,7 @@ class Configurator:
:return a ConfigParser object of sources.cfg :return a ConfigParser object of sources.cfg
""" """
current_dir = os.path.dirname(os.path.abspath(__file__)) current_dir = os.path.dirname(os.path.abspath(__file__))
config_path = current_dir + '\..\sources.cfg' config_path = current_dir + '/../sources.cfg'
# [TODO]: location of sources.cfg should be softcoded eventually # [TODO]: location of sources.cfg should be softcoded eventually
config = ConfigParser.ConfigParser() config = ConfigParser.ConfigParser()
config.read(config_path) config.read(config_path)