Merge branch 'develop' into feature/GUI
commit 520216b528
@@ -1,3 +1,7 @@
+### v0.6.0
+- FIX: Using absolute path for configuration files
+- DEV: General Code cleanup in documentation
+
 ### v0.5.3
 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
 - FIX: Logging is now "actually" disabled if not using the verbose option.
@@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent

-# [todo] - Check for repercussions on spoofing the user agent
-
-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+USER_AGENT = 'Fourmi'
@@ -9,24 +9,28 @@ from FourmiCrawler.items import Result


 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
 # [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not

 class ChemSpider(Source):
-    """ChemSpider scraper for synonyms and properties
-
+    """
+    ChemSpider scraper for synonyms and properties
+    This parser will manage searching for chemicals through the
+    ChemSpider API, and parsing the resulting ChemSpider page.
+    The token required for the API should be in a configuration file
+    somewhere.
     """

-    website = 'http://www.chemspider.com/*'
+    website = 'http://www\\.chemspider\\.com/.*'

     search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='

     def __init__(self, config=None):
+        """
+        Initialization of ChemSpider scraper
+        :param config: a dictionary of settings for this scraper, must contain
+        'reliability' key
+        """
         Source.__init__(self, config)
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
@@ -37,6 +41,12 @@ class ChemSpider(Source):
             self.extendedinfo += self.cfg['token']

     def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
         sel = Selector(response)
         requests = []
         requests_synonyms = self.parse_synonyms(sel)
@@ -47,10 +57,26 @@ class ChemSpider(Source):
         return requests

     def parse_properties(self, sel):
-        """scrape Experimental Data and Predicted ACD/Labs tabs"""
+        """
+        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
         properties = []

+        properties.extend(self.parse_acdlabstab(sel))
+        properties.extend(self.parse_experimentaldatatab(sel))
+
+        return properties
+
+    def parse_acdlabstab(self, sel):
+        """
+        This function scrapes the 'Predicted ACD/Labs tab' under Properties
+        :param sel: a Selector object of the whole page
+        :return: a list of Request objects
+        """
+        properties = []
+
         # Predicted - ACD/Labs tab
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
@@ -62,16 +88,15 @@ class ChemSpider(Source):
             prop_conditions = ''

             # Test for properties without values, with one hardcoded exception
-            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
+            if (not re.match(r'^\d', prop_value) or
+                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                 continue

             # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
                 prop_name = m.group(1)
                 prop_conditions = m.group(2)

             # Match for condition in value separated by an 'at'
             m = re.match(r'(.*) at (.*)', prop_value)
             if m:
                 prop_value = m.group(1)
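Note: the two `re.match` calls above split a scraped property into name, value, and conditions. A self-contained illustration of that matching logic (the property strings are made-up examples, not data from a live ChemSpider page, and the assignment of the second group to the conditions in the 'at' branch is inferred from the surrounding code, since that line falls past the hunk edge):

    import re

    prop_name, prop_value, prop_conditions = 'Boiling Point (Celsius)', '100 at 760 mmHg', ''

    # Condition embedded in the name, in parentheses
    m = re.match(r'(.*) \((.*)\)', prop_name)
    if m:
        prop_name, prop_conditions = m.group(1), m.group(2)  # 'Boiling Point', 'Celsius'

    # Condition embedded in the value, separated by an 'at'
    m = re.match(r'(.*) at (.*)', prop_value)
    if m:
        prop_value, prop_conditions = m.group(1), m.group(2)  # '100', '760 mmHg'

    print(prop_name)        # Boiling Point
    print(prop_value)       # 100
    print(prop_conditions)  # 760 mmHg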
@@ -84,11 +109,18 @@ class ChemSpider(Source):
                 conditions=prop_conditions
             )
             properties.append(new_prop)
             log.msg('CS prop: |%s| |%s| |%s|' %
                     (new_prop['attribute'], new_prop['value'], new_prop['source']),
                     level=log.DEBUG)

-        # Experimental Data Tab, Physico-chemical properties in particular
+        return properties
+
+    def parse_experimentaldatatab(self, sel):
+        """
+        This function scrapes the Experimental Data tab, Physico-chemical
+        properties in particular.
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
+        properties = []
+
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                  'Properties"]//li/table/tr/td')
         if not scraped_list:
@@ -106,14 +138,15 @@ class ChemSpider(Source):
                 source=line.xpath('strong/text()').extract()[0].rstrip(),
             )
             properties.append(new_prop)
             log.msg('CS prop: |%s| |%s| |%s|' %
                     (new_prop['attribute'], new_prop['value'],
                      new_prop['source']), level=log.DEBUG)

         return properties

     def parse_synonyms(self, sel):
-        """Scrape list of Names and Identifiers"""
+        """
+        This function scrapes the list of Names and Identifiers
+        :param sel: a Selector object of the whole page
+        :return: a list of Requests
+        """
         requests = []
         synonyms = []

@@ -145,7 +178,13 @@ class ChemSpider(Source):
         return requests

     def new_synonym(self, sel, name, category):
-        """Scrape for a single synonym at a given HTML tag"""
+        """
+        This function scrapes for a single synonym at a given HTML tag
+        :param sel: a Selector object of the given HTML tag
+        :param name: the name of the synonym in the tag
+        :param category: the name of the category the synonym is labeled as
+        :return: a dictionary containing data on the synonym
+        """
         self.ignore_list.append(name)
         language = sel.xpath('span[@class="synonym_language"]/text()')
         if language:
@@ -181,7 +220,12 @@ class ChemSpider(Source):
         return synonym

     def parse_extendedinfo(self, response):
-        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
+        """
+        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
+        API, if a token is present in the configuration settings
+        :param response: a Response object to be parsed
+        :return: a list of Result items
+        """
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
@@ -197,8 +241,16 @@ class ChemSpider(Source):
         return properties

     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result(
-            {
+        """
+        This function abstracts from the Result item and provides default
+        values.
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :param source: the name of the source if it is not ChemSpider
+        :return: A Result item
+        """
+        return Result({
             'attribute': attribute,
             'value': value,
             'source': source,
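Note: a minimal stand-in for the `newresult` helper, to show the defaulting behaviour it abstracts. A plain dict replaces the Scrapy Result item here, the example attribute and value are invented, and the 'Unknown' reliability default is an assumption, since the remaining fields of the real method fall outside the hunk shown above:

    def newresult(attribute, value, conditions='', source='ChemSpider', reliability='Unknown'):
        # Illustrative stand-in for ChemSpider.newresult, not the project method
        return {
            'attribute': attribute,
            'value': value,
            'source': source,
            'reliability': reliability,
            'conditions': conditions,
        }

    print(newresult('Boiling Point', '100 Celsius', conditions='760 mmHg'))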
@@ -207,7 +259,13 @@ class ChemSpider(Source):
         })

     def parse_searchrequest(self, response):
-        """Parse the initial response of the ChemSpider Search API """
+        """
+        This function parses the initial response of the ChemSpider Search API
+        Requires a valid token to function.
+        :param response: the Response object to be parsed
+        :return: A Request for the information page and a Request for the
+        extendedinfo API call
+        """
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
@@ -219,8 +277,8 @@ class ChemSpider(Source):
             log.msg('ChemSpider found multiple substances, taking first '
                     'element', level=log.DEBUG)
         csid = csids[0]
-        structure_url = self.website[:-1] + self.structure % csid
-        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
+        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
         return [Request(url=structure_url,
                         callback=self.parse),
@@ -228,8 +286,13 @@ class ChemSpider(Source):
                 callback=self.parse_extendedinfo)]

     def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
         if compound in self.ignore_list or self.cfg['token'] == '':
             return None
-        searchurl = self.website[:-1] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
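Note: the recurring `self.website[:-2].replace("\\", "")` idiom in this commit follows from the `website` attribute now being an escaped regular expression rather than a glob: slicing off the trailing `.*` and stripping the escaping backslashes turns the pattern back into a base URL. A small sketch using the pattern from this file ('benzene' is an arbitrary example query):

    website = 'http://www\\.chemspider\\.com/.*'
    search = 'Search.asmx/SimpleSearch?query=%s&token='

    # Drop the trailing '.*', then un-escape '\.' back to '.'
    base_url = website[:-2].replace("\\", "")
    searchurl = base_url + search % 'benzene'
    print(searchurl)  # http://www.chemspider.com/Search.asmx/SimpleSearch?query=benzene&token=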
@@ -13,20 +13,31 @@ from FourmiCrawler.items import Result
 # Result item, but should be included eventually.

 class NIST(Source):
-    """NIST Scraper plugin
-
+    """
+    NIST Scraper plugin
+    This plugin manages searching for a chemical on the NIST website
+    and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook\\.nist\\.gov/.*"

     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

     def __init__(self, config=None):
+        """
+        Initialization of NIST scraper
+        :param config: configuration variables for this scraper, must contain
+        'reliability' key.
+        """
         Source.__init__(self, config)
         self.ignore_list = set()

     def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: The Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
         sel = Selector(response)

         title = sel.xpath('head/title/text()').extract()[0]
@@ -51,6 +62,21 @@ class NIST(Source):
             log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                     level=log.DEBUG)

+        requests.extend(self.parse_tables(sel, symbol_table))
+
+        return requests
+
+    def parse_tables(self, sel, symbol_table):
+        """
+        This function identifies and distributes parsing of tables to other
+        functions below.
+        :param sel: A Selector object of the whole page
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Requests
+        """
+        requests = []
+
         for table in sel.xpath('//table[@class="data"]'):
             summary = table.xpath('@summary').extract()[0]
             if summary == 'One dimensional data':
@@ -81,8 +107,12 @@ class NIST(Source):
         return requests

     def parse_generic_info(self, sel):
-        """Parses: synonyms, chemical formula, molecular weight, InChI,
-        InChiKey, CAS number
+        """
+        This function parses: synonyms, chemical formula, molecular weight,
+        InChI, InChiKey, CAS number
+        :param sel: A Selector object of the entire page in the original
+        response
+        :return: a list of Result items
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')

@@ -121,15 +151,20 @@ class NIST(Source):
         return requests

     def parse_aggregate_data(self, table, symbol_table):
-        """Parses the table(s) which contain possible links to individual
-        data points
+        """
+        This function parses the table(s) which contain possible links to
+        individual data points
+        :param table: a Selector object of the table to be parsed
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Request objects
         """
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
-                request = Request(url=self.website[:-1] + extra_data_url[0],
+                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
@@ -155,14 +190,16 @@ class NIST(Source):
         return results

     def parse_transition_data(self, table, summary):
-        """Parses the table containing properties regarding phase changes"""
+        """
+        This function parses the table containing properties regarding phase
+        changes
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
         results = []

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -176,18 +213,18 @@ class NIST(Source):
         return results

     def parse_generic_data(self, table, summary):
-        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+        """
+        Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
         Symbol (unit)|Temperature (K)|Reference|Comment
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
         """
         results = []

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -200,7 +237,13 @@ class NIST(Source):
         return results

     def parse_antoine_data(self, table, summary):
-        """Parse table containing parameters for the Antione equation"""
+        """
+        This function parses the table containing parameters for the Antoine
+        equation
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
         results = []

         for tr in table.xpath('tr[td]'):
@@ -215,7 +258,12 @@ class NIST(Source):
         return results

     def parse_individual_datapoints(self, response):
-        """Parses the page linked from aggregate data"""
+        """
+        This function parses the 'individual data points' page linked from
+        the aggregate data table(s)
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items
+        """
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]

@@ -228,11 +276,7 @@ class NIST(Source):
             name = m.group(1)
             condition = m.group(2)

-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)

         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -250,7 +294,25 @@ class NIST(Source):

         return results

+    @staticmethod
+    def get_unit(table):
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+
+        return unit
+
     def newresult(self, attribute, value, conditions=''):
+        """
+        This function abstracts from the Result item and provides default
+        values
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :return: A Result item
+        """
         return Result(
             {
                 'attribute': attribute,
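Note: `get_unit` centralises the unit-sniffing block that was previously repeated in three parse methods. A standalone sketch of the same logic, operating on a plain header string instead of a Scrapy Selector (the header samples are invented):

    import re

    def get_unit(header_text):
        # Pull the unit out of a column header such as 'T (K)';
        # '!' is the sentinel for 'no unit found'
        m = re.search(r'\((.*)\)', header_text)
        return m.group(1) if m else '!'

    print(get_unit('T (K)'))        # K
    print(get_unit('Temperature'))  # !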
@@ -261,7 +323,12 @@ class NIST(Source):
             })

     def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-1] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                            callback=self.parse)
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector

+from source import Source
 from FourmiCrawler.items import Result
-import re


 class PubChem(Source):
@@ -13,10 +15,10 @@ class PubChem(Source):
     including sources of the values of properties.
     """

-    #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://*.ncbi.nlm.nih.gov/*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
+    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'http://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
     search = 'pccompound?term=%s'
     data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'

@@ -49,17 +51,19 @@ class PubChem(Source):
             self._spider.get_synonym_requests(synonym)
         log.msg('Raw synonyms found: %s' % raw_synonyms, level=log.DEBUG)

-        n = re.search(r'cid=(\d+)',response.url)
+        n = re.search(r'cid=(\d+)', response.url)
         if n:
             cid = n.group(1)
-        log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach
+        log.msg('cid: %s' % cid, level=log.DEBUG)  # getting the right id of the compound with which it can reach
         # the separate html page which contains the properties and their values

-        #using this cid to get the right url and scrape it
-        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+        # using this cid to get the right url and scrape it
+        requests.append(
+            Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
         return requests

-    def parse_data(self, response):
+    @staticmethod
+    def parse_data(response):
         """
         Parse data found in 'Chemical and Physical properties' part of a substance page.
         :param response: The response with the page to parse
@@ -80,7 +84,7 @@ class PubChem(Source):
                 'attribute': prop_name,
                 'value': prop_value,
                 'source': prop_source,
-                'reliability': 'Unknown',
+                'reliability': self.cfg['reliability'],
                 'conditions': ''
             })
             log.msg('PubChem prop: |%s| |%s| |%s|' %
|
||||
'attribute': prop_name,
|
||||
'value': prop_value,
|
||||
'source': prop_source,
|
||||
'reliability': 'Unknown',
|
||||
'reliability': self.cfg['reliability'],
|
||||
'conditions': ''
|
||||
})
|
||||
log.msg('PubChem prop: |%s| |%s| |%s|' %
|
||||
@@ -106,6 +110,41 @@ class PubChem(Source):

         return requests

+    def parse_searchrequest(self, response):
+        """
+        This function parses the response to the new_compound_request Request
+        :param response: the Response object to be parsed
+        :return: A Request for the compound page, or what self.parse returns
+        in case the search request was forwarded to the compound page
+        """
+
+        # check if pubchem forwarded straight to compound page
+        m = re.match(self.website_pubchem, response.url)
+        if m:
+            log.msg('PubChem search forwarded to compound page',
+                    level=log.DEBUG)
+            return self.parse(response)
+
+        sel = Selector(response)
+
+        results = sel.xpath('//div[@class="rsltcont"]')
+        if results:
+            url = results[0].xpath('div/p/a[1]/@href')
+        else:
+            log.msg('PubChem search found nothing or xpath failed',
+                    level=log.DEBUG)
+            return None
+
+        if url:
+            url = 'http:' + ''.join(url[0].extract())
+            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
+        else:
+            log.msg('PubChem search found results, but no url in first result',
+                    level=log.DEBUG)
+            return None
+
+        return Request(url=url, callback=self.parse)
+
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound,
+                       callback=self.parse_searchrequest)
@@ -15,7 +15,7 @@ class WikipediaParser(Source):
     It also returns requests with other external sources which contain information on parsed subject.
     """

-    website = "http://en.wikipedia.org/wiki/*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
     __spider = None
     searched_compounds = []

@@ -123,7 +123,7 @@ class WikipediaParser(Source):
         return items

     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)

     @staticmethod
     def clean_items(items):
@@ -3,7 +3,7 @@ from scrapy import log


 class Source:
-    website = "http://something/*"  # Regex of URI's the source is able to parse
+    website = "http://something/.*"  # Regex of URI's the source is able to parse
     _spider = None

     def __init__(self, config=None):
@@ -30,7 +30,7 @@ class Source:
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
        pass

     def set_spider(self, spider):
@@ -34,8 +34,9 @@ class FourmiSpider(Spider):
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None

     def get_synonym_requests(self, compound, force=False):
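Note: this is the dispatch loop that the escaped-regex `website` patterns feed into: each response URL is tried against every registered source with `re.match`, and the first match gets to parse the page. A self-contained illustration (the pattern registry and URL below are examples, not the spider's actual state):

    import re

    sources = {
        'ChemSpider': 'http://www\\.chemspider\\.com/.*',
        'NIST': 'http://webbook\\.nist\\.gov/.*',
    }

    url = 'http://webbook.nist.gov/cgi/cbook.cgi?Name=benzene'
    for name, pattern in sources.items():
        if re.match(pattern, url):
            print('URL: %s -> Source: %s' % (url, name))
            break
    else:
        print('URL: %s -> No compatible source' % url)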
@@ -48,7 +48,6 @@ __Main goals:__
 - Build a graphical user interface (GUI) as an alternative for the command line
 interface (CLI). (Assignee: Harmen)
 - Compiling the source into a Windows executable. (Assignee: Bas)
-- Create an module to gather data from PubChem. (Assignee: Nout)

 __Side goals:__

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
+Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms).

 Usage:
     fourmi
@@ -18,7 +18,7 @@ Options:
   --version                Show version.
   -v                       Verbose logging output. (Multiple occurrences increase logging level)
   --log=<file>             Save log to a file.
-  -o <file> --output=<file>  Output file [default: results.*format*]
+  -o <file> --output=<file>  Output file [default: <compound>.*format*]
   -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
   --include=<regex>        Include only sources that match these regular expressions split by a comma.
   --exclude=<regex>        Exclude the sources that match these regular expressions split by a comma.
@@ -61,7 +61,7 @@ def search(docopt_arguments, source_loader):
     """
     conf = Configurator()
     conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                   source_loader, docopt_arguments["--attributes"].split(','))
     if conf.scrapy_settings.getbool("LOG_ENABLED"):
sources.cfg.sample (new file, 19 lines)
@@ -0,0 +1,19 @@
+[DEFAULT]
+reliability = Unknown
+
+#For each source listed in FourmiCrawler/sources there should be a section
+#named exactly as the filename in here. If not present, the DEFAULT value is
+#used for reliability of that source.
+
+[ChemSpider]
+reliability = High
+#token=Paste ChemSpider API token here and remove the hashtag
+
+[NIST]
+reliability = High
+
+[WikipediaParser]
+reliability = Medium
+
+[PubChem]
+reliability = High
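Note: when this sample file is read with ConfigParser, a source section that omits its own reliability key falls back to the [DEFAULT] value. A quick demonstration with an in-memory copy of the file ([SomeNewSource] is a hypothetical section added just for the example):

    import ConfigParser  # 'configparser' on Python 3
    import io
    import textwrap

    sample = textwrap.dedent(u"""\
        [DEFAULT]
        reliability = Unknown

        [ChemSpider]
        reliability = High

        [SomeNewSource]
        """)

    config = ConfigParser.ConfigParser()
    config.readfp(io.StringIO(sample))  # read_file on Python 3

    print(config.get('ChemSpider', 'reliability'))     # High
    print(config.get('SomeNewSource', 'reliability'))  # Unknown, via [DEFAULT]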
@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
         self.conf = Configurator()

     def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")

-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("<compound>.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

     def test_start_log(self):
@@ -1,4 +1,5 @@
 import ConfigParser
+import os

 from scrapy.utils.project import get_project_settings

@@ -12,7 +13,7 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()

-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
         In the Fourmi project these are command line arguments.
@@ -20,12 +21,12 @@
         :param fileformat: The format in which the output will be.
         """

-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
             self.scrapy_settings.overrides["FEED_URI"] = filename
         elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
         elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat

         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
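Note: a hand-traced, standalone version of the FEED_URI branch above, to show how the new compound parameter resolves the output filename. `resolve_feed_uri` is a made-up name for this sketch and 'water' an arbitrary example compound; the real logic writes into scrapy_settings.overrides instead of returning:

    def resolve_feed_uri(filename, fileformat, compound):
        # Mirrors Configurator.set_output's FEED_URI decision
        if filename != '<compound>.*format*':
            return filename                   # an explicit --output wins
        elif fileformat == "jsonlines":
            return compound + ".json"         # jsonlines feeds get a .json name
        elif fileformat is not None:
            return compound + "." + fileformat
        return None                           # otherwise leave Scrapy's default

    print(resolve_feed_uri('<compound>.*format*', 'csv', 'water'))  # water.csv
    print(resolve_feed_uri('output.txt', 'csv', 'water'))           # output.txt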
@@ -66,8 +67,11 @@ class Configurator:
         variables for sources
         :return a ConfigParser object of sources.cfg
         """
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        config_path = current_dir + '/../sources.cfg'
+        # [TODO]: location of sources.cfg should be softcoded eventually
         config = ConfigParser.ConfigParser()
-        config.read('sources.cfg') # [TODO]: should be softcoded eventually
+        config.read(config_path)
         return config

     @staticmethod