Merge branch 'feature/PubChem-fixes' into develop
commit 335f558aca
@@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
 
-# [todo] - Check for repercussions on spoofing the user agent
-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
-
+USER_AGENT = 'Fourmi'
@@ -1,3 +1,5 @@
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
@@ -5,7 +7,6 @@ from scrapy.selector import Selector
 from source import Source
 from FourmiCrawler.items import Result
 
-import re
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
 
@@ -18,7 +19,7 @@ class ChemSpider(Source):
     somewhere.
     """
 
-    website = 'http://www.chemspider.com/*'
+    website = 'http://www\\.chemspider\\.com/.*'
 
     search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
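
Note: throughout this commit, each source's `website` attribute changes from a glob-style string to a proper regular expression, because the spider matches response URLs against it with `re.match`. A minimal sketch of the difference (illustrative URLs, not from the commit):

    import re

    old_pattern = 'http://www.chemspider.com/*'       # glob-style; as a regex, '.' matches any character
    new_pattern = 'http://www\\.chemspider\\.com/.*'  # regex-style: literal dots, then any path

    # The glob-style string, read as a regex, also matches look-alike hosts:
    print(bool(re.match(old_pattern, 'http://wwwXchemspiderYcom/')))  # True
    # The escaped pattern matches only the real domain:
    print(bool(re.match(new_pattern, 'http://wwwXchemspiderYcom/')))  # False
    print(bool(re.match(new_pattern, 'http://www.chemspider.com/Chemical-Structure.2157.html')))  # True
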
@@ -276,8 +277,8 @@ class ChemSpider(Source):
         log.msg('ChemSpider found multiple substances, taking first '
                 'element', level=log.DEBUG)
         csid = csids[0]
-        structure_url = self.website[:-1] + self.structure % csid
-        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
+        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
         return [Request(url=structure_url,
                         callback=self.parse),
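
Note: since `website` is now a regex, the old `self.website[:-1]` trick (drop a trailing `*`) no longer yields a usable base URL; the new idiom drops the trailing `.*` and strips the escape backslashes. A minimal sketch of what `[:-2].replace("\\", "")` produces (the csid value is hypothetical):

    website = 'http://www\\.chemspider\\.com/.*'  # regex form of the class attribute
    base_url = website[:-2].replace("\\", "")     # drop '.*', remove escape backslashes
    assert base_url == 'http://www.chemspider.com/'

    structure = 'Chemical-Structure.%s.html'
    csid = '2157'                                 # hypothetical ChemSpider ID
    print(base_url + structure % csid)            # http://www.chemspider.com/Chemical-Structure.2157.html
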
@@ -292,6 +293,6 @@ class ChemSpider(Source):
         """
         if compound in self.ignore_list or self.cfg['token'] == '':
             return None
-        searchurl = self.website[:-1] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
@@ -18,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook\\.nist\\.gov/.*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
 
@@ -164,7 +164,7 @@ class NIST(Source):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
-                request = Request(url=self.website[:-1] + extra_data_url[0],
+                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
@@ -329,5 +329,5 @@ class NIST(Source):
         """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-1] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                            callback=self.parse)
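
Note: one pre-existing caveat in this hunk's context lines, untouched by the commit: if `ignore_list` is a set, `update(compound)` iterates over the string and adds its individual characters rather than the compound name; `add(compound)` is probably what is intended. A sketch of the difference (assuming a plain set):

    ignore_list = set()
    ignore_list.update('CO2')  # adds 'C', 'O', '2' as three separate one-character entries
    ignore_list = set()
    ignore_list.add('CO2')     # adds the single name 'CO2'
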
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
 
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
+
 class PubChem(Source):
@@ -14,9 +16,9 @@ class PubChem(Source):
     """
 
     #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://*.ncbi.nlm.nih.gov/*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'http://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
     search = 'pccompound?term=%s'
     data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
 
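
Note: after this hunk the three URL attributes mix styles: `website` and `website_pubchem` are regexes, while `website_www` stays glob-style, apparently because `new_compound_request` below still builds its URL with the `[:-1]` idiom. Each attribute's style has to match the slicing applied to it; a side-by-side sketch of the two idioms:

    website_www = 'http://www.ncbi.nlm.nih.gov/*'           # glob-style
    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'  # regex-style

    print(website_www[:-1])                        # http://www.ncbi.nlm.nih.gov/
    print(website_pubchem[:-2].replace("\\", ""))  # http://pubchem.ncbi.nlm.nih.gov/
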
@@ -49,14 +51,14 @@ class PubChem(Source):
                 self._spider.get_synonym_requests(synonym)
         log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)
 
-        n = re.search(r'cid=(\d+)',response.url)
+        n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
             log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
             # the seperate html page which contains the properties and their values
 
             #using this cid to get the right url and scrape it
-            requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+            requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data))
         return requests
 
     def parse_data(self, response):
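
Note: the tidied `re.search(r'cid=(\d+)', response.url)` pulls the numeric compound id out of the result URL; `group(1)` is the captured run of digits. A minimal sketch with a hypothetical URL:

    import re

    url = 'http://www.ncbi.nlm.nih.gov/pccompound?cid=702'  # hypothetical result URL
    n = re.search(r'cid=(\d+)', url)
    if n:
        print(n.group(1))  # '702'
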
@@ -106,6 +108,41 @@ class PubChem(Source):
 
         return requests
 
+    def parse_searchrequest(self, response):
+        """
+        This function parses the response to the new_compound_request Request
+        :param response: the Response object to be parsed
+        :return: A Request for the compound page or what self.parse returns in
+        case the search request forwarded to the compound page
+        """
+
+        #check if pubchem forwarded straight to compound page
+        m = re.match(self.website_pubchem, response.url)
+        if m:
+            log.msg('PubChem search forwarded to compound page',
+                    level=log.DEBUG)
+            return self.parse(response)
+
+        sel = Selector(response)
+
+        results = sel.xpath('//div[@class="rsltcont"]')
+        if results:
+            url = results[0].xpath('div/p/a[1]/@href')
+        else:
+            log.msg('PubChem search found nothing or xpath failed',
+                    level=log.DEBUG)
+            return None
+
+        if url:
+            url = 'http:' + ''.join(url[0].extract())
+            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
+        else:
+            log.msg('PubChem search found results, but no url in first result',
+                    level=log.DEBUG)
+            return None
+
+        return Request(url=url, callback=self.parse)
 
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound,
+                       callback=self.parse_searchrequest)
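
Note: the new `parse_searchrequest` begins by testing `re.match(self.website_pubchem, response.url)` to detect that PubChem forwarded the search straight to a compound page, which is exactly why `website_pubchem` had to become a regex in the earlier hunk. The check in isolation (hypothetical URLs):

    import re

    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'

    # Forwarded to a compound page -> parse it directly:
    print(bool(re.match(website_pubchem, 'http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=702')))  # True
    # Still on the search result list -> scrape the first result:
    print(bool(re.match(website_pubchem, 'http://www.ncbi.nlm.nih.gov/pccompound?term=aspirin')))          # False
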
@@ -15,7 +15,7 @@ class WikipediaParser(Source):
     It also returns requests with other external sources which contain information on parsed subject.
     """
 
-    website = "http://en.wikipedia.org/wiki/*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
     __spider = None
     searched_compounds = []
 
@@ -123,7 +123,7 @@ class WikipediaParser(Source):
         return items
 
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
 
     @staticmethod
     def clean_items(items):
@@ -3,7 +3,7 @@ from scrapy import log
 
 
 class Source:
-    website = "http://something/*" # Regex of URI's the source is able to parse
+    website = "http://something/.*" # Regex of URI's the source is able to parse
     _spider = None
 
     def __init__(self, config=None):
@@ -30,7 +30,7 @@ class Source:
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):
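
Note: source.py keeps the base-class contract in sync: `website` is documented as a regex, the placeholder becomes `"http://something/.*"`, and the commented-out example request uses the same `[:-2].replace` idiom as the concrete sources. A minimal sketch of a conforming subclass (a hypothetical plugin, for illustration only):

    from scrapy.http import Request

    from source import Source


    class ExampleSource(Source):  # hypothetical source plugin
        website = 'http://data\\.example\\.org/.*'  # regex of URIs this source can parse

        def new_compound_request(self, compound):
            base_url = self.website[:-2].replace("\\", "")
            return Request(url=base_url + compound, callback=self.parse)
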
@@ -34,8 +34,9 @@ class FourmiSpider(Spider):
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None
 
     def get_synonym_requests(self, compound, force=False):
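
Note: this dispatch loop is the reason every `website` attribute must be a valid regex: the old glob-style PubChem pattern never matched real response URLs, so PubChem responses fell through (and, before the added INFO line, did so silently). A sketch of the failure mode (the URL is hypothetical):

    import re

    old = 'https://*.ncbi.nlm.nih.gov/*'           # glob-style, but fed to re.match by the spider
    new = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'  # regex-style, from this commit

    url = 'http://www.ncbi.nlm.nih.gov/pccompound?term=aspirin'
    print(bool(re.match(old, url)))  # False: scheme differs and '*' is a quantifier, not a wildcard
    print(bool(re.match(new, url)))  # True
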
@@ -1,7 +1,7 @@
 import ConfigParser
+import os
 
 from scrapy.utils.project import get_project_settings
-import os
 
 class Configurator:
     """
@@ -67,7 +67,7 @@ class Configurator:
         :return a ConfigParser object of sources.cfg
         """
         current_dir = os.path.dirname(os.path.abspath(__file__))
-        config_path = current_dir + '\..\sources.cfg'
+        config_path = current_dir + '/../sources.cfg'
         # [TODO]: location of sources.cfg should be softcoded eventually
         config = ConfigParser.ConfigParser()
         config.read(config_path)
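
Note: the old `'\..\sources.cfg'` only resolved on Windows; the forward-slash form works on all platforms. The fully platform-neutral spelling, as the [TODO] hints, would use `os.path.join`; a sketch:

    import os

    current_dir = os.path.dirname(os.path.abspath(__file__))
    # Equivalent to current_dir + '/../sources.cfg', with no hard-coded separators:
    config_path = os.path.normpath(os.path.join(current_dir, '..', 'sources.cfg'))
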