Merge branch 'develop' into feature/GUI

Harmen Prins 2014-06-20 16:51:19 +02:00
commit 520216b528
13 changed files with 297 additions and 104 deletions

View File

@@ -1,3 +1,7 @@
+### v0.6.0
+- FIX: Using absolute path for configuration files
+- DEV: General Code cleanup in documentation
+
 ### v0.5.3
 - FIX: It is now again possible to use both verbose and the source inclusion/exclusion options
 - FIX: Logging is now "actually" disabled if not using the verbose option.

View File

@@ -21,7 +21,4 @@ FEED_FORMAT = 'jsonlines'
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
-# [todo] - Check for repercussions on spoofing the user agent
-# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
+USER_AGENT = 'Fourmi'

View File

@@ -9,24 +9,28 @@ from FourmiCrawler.items import Result
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
+# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
 class ChemSpider(Source):
-    """ChemSpider scraper for synonyms and properties
+    """
+    ChemSpider scraper for synonyms and properties
     This parser will manage searching for chemicals through the
     ChemsSpider API, and parsing the resulting ChemSpider page.
     The token required for the API should be in a configuration file
     somewhere.
     """
-    website = 'http://www.chemspider.com/*'
+    website = 'http://www\\.chemspider\\.com/.*'
     search = 'Search.asmx/SimpleSearch?query=%s&token='
     structure = 'Chemical-Structure.%s.html'
     extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
     def __init__(self, config=None):
+        """
+        Initialization of ChemSpider scraper
+        :param config: a dictionary of settings for this scraper, must contain
+        'reliability' key
+        """
         Source.__init__(self, config)
         self.ignore_list = []
         if 'token' not in self.cfg or self.cfg['token'] == '':
@@ -37,6 +41,12 @@ class ChemSpider(Source):
         self.extendedinfo += self.cfg['token']
     def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
         sel = Selector(response)
         requests = []
         requests_synonyms = self.parse_synonyms(sel)
@@ -47,10 +57,26 @@ class ChemSpider(Source):
         return requests
     def parse_properties(self, sel):
-        """scrape Experimental Data and Predicted ACD/Labs tabs"""
+        """
+        This function scrapes the Experimental Data and Predicted ACD/Labs tabs
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
+        properties = []
+        properties.extend(self.parse_acdlabstab(sel))
+        properties.extend(self.parse_experimentaldatatab(sel))
+        return properties
+    def parse_acdlabstab(self, sel):
+        """
+        This function scrapes the 'Predicted ACD/Labs tab' under Properties
+        :param sel: a Selector object of the whole page
+        :return: a list of Request objects
+        """
         properties = []
-        # Predicted - ACD/Labs tab
         td_list = sel.xpath('.//table[@id="acdlabs-table"]//td').xpath(
             'normalize-space(string())')
         prop_names = td_list[::2]
@@ -62,16 +88,15 @@ class ChemSpider(Source):
             prop_conditions = ''
             # Test for properties without values, with one hardcoded exception
-            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
+            if (not re.match(r'^\d', prop_value) or
+                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                 continue
-            # Match for condition in parentheses
             m = re.match(r'(.*) \((.*)\)', prop_name)
             if m:
                 prop_name = m.group(1)
                 prop_conditions = m.group(2)
-            # Match for condition in value seperated by an 'at'
             m = re.match(r'(.*) at (.*)', prop_value)
             if m:
                 prop_value = m.group(1)
@@ -84,11 +109,18 @@ class ChemSpider(Source):
                 conditions=prop_conditions
             )
             properties.append(new_prop)
-            log.msg('CS prop: |%s| |%s| |%s|' %
-                    (new_prop['attribute'], new_prop['value'], new_prop['source']),
-                    level=log.DEBUG)
-        # Experimental Data Tab, Physico-chemical properties in particular
+        return properties
+    def parse_experimentaldatatab(self, sel):
+        """
+        This function scrapes Experimental Data tab, Physico-chemical
+        properties in particular.
+        :param sel: a Selector object of the whole page
+        :return: a list of Result items
+        """
+        properties = []
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
                                  'Properties"]//li/table/tr/td')
         if not scraped_list:
@@ -106,14 +138,15 @@ class ChemSpider(Source):
                 source=line.xpath('strong/text()').extract()[0].rstrip(),
             )
             properties.append(new_prop)
-            log.msg('CS prop: |%s| |%s| |%s|' %
-                    (new_prop['attribute'], new_prop['value'],
-                     new_prop['source']), level=log.DEBUG)
         return properties
     def parse_synonyms(self, sel):
-        """Scrape list of Names and Identifiers"""
+        """
+        This function scrapes the list of Names and Identifiers
+        :param sel: a Selector object of the whole page
+        :return: a list of Requests
+        """
         requests = []
         synonyms = []
@@ -145,7 +178,13 @@ class ChemSpider(Source):
         return requests
     def new_synonym(self, sel, name, category):
-        """Scrape for a single synonym at a given HTML tag"""
+        """
+        This function scrapes for a single synonym at a given HTML tag
+        :param sel: a Selector object of the given HTML tag
+        :param name: the name of the synonym in the tag
+        :param category: the name of the category the synonym is labeled as
+        :return: a dictionary containing data on the synonym
+        """
         self.ignore_list.append(name)
         language = sel.xpath('span[@class="synonym_language"]/text()')
         if language:
@@ -181,7 +220,12 @@ class ChemSpider(Source):
         return synonym
     def parse_extendedinfo(self, response):
-        """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
+        """
+        This function scrapes data from the ChemSpider GetExtendedCompoundInfo
+        API, if a token is present in the configuration settings
+        :param response: a Response object to be parsed
+        :return: a list of Result items
+        """
         sel = Selector(response)
         properties = []
         names = sel.xpath('*').xpath('name()').extract()
@@ -197,8 +241,16 @@ class ChemSpider(Source):
         return properties
     def newresult(self, attribute, value, conditions='', source='ChemSpider'):
-        return Result(
-            {
+        """
+        This function abstracts from the Result item and provides default
+        values.
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :param source: the name of the source if it is not ChemSpider
+        :return: A Result item
+        """
+        return Result({
             'attribute': attribute,
             'value': value,
             'source': source,
@@ -207,7 +259,13 @@ class ChemSpider(Source):
         })
     def parse_searchrequest(self, response):
-        """Parse the initial response of the ChemSpider Search API """
+        """
+        This function parses the initial response of the ChemSpider Search API
+        Requires a valid token to function.
+        :param response: the Response object to be parsed
+        :return: A Request for the information page and a Request for the
+        extendedinfo API call
+        """
         sel = Selector(response)
         log.msg('chemspider parse_searchrequest', level=log.DEBUG)
         sel.register_namespace('cs', 'http://www.chemspider.com/')
@@ -219,8 +277,8 @@ class ChemSpider(Source):
             log.msg('ChemSpider found multiple substances, taking first '
                     'element', level=log.DEBUG)
         csid = csids[0]
-        structure_url = self.website[:-1] + self.structure % csid
-        extendedinfo_url = self.website[:-1] + self.extendedinfo % csid
+        structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
+        extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
         log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
         return [Request(url=structure_url,
                         callback=self.parse),
@@ -228,8 +286,13 @@ class ChemSpider(Source):
                         callback=self.parse_extendedinfo)]
     def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
         if compound in self.ignore_list or self.cfg['token'] == '':
             return None
-        searchurl = self.website[:-1] + self.search % compound
+        searchurl = self.website[:-2].replace("\\", "") + self.search % compound
         log.msg('chemspider compound', level=log.DEBUG)
         return Request(url=searchurl, callback=self.parse_searchrequest)
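Note: the recurring change in this commit is that each source's `website` attribute is now an escaped regular expression (matched by the spider against response URLs) instead of a glob, and the literal base URL is recovered from it with `[:-2].replace("\\", "")`. A minimal standalone sketch of that pattern, with a hypothetical example URL:

import re

# The escaped regex doubles as a URL matcher and, once the trailing '.*'
# and the backslash escapes are stripped, as the base URL for new Requests.
website = 'http://www\\.chemspider\\.com/.*'

# Matching, as FourmiSpider.parse does against response URLs:
assert re.match(website, 'http://www.chemspider.com/Chemical-Structure.2157.html')

# Rebuilding a literal base URL, as parse_searchrequest does:
base_url = website[:-2].replace("\\", "")  # -> 'http://www.chemspider.com/'
print(base_url + 'Chemical-Structure.%s.html' % 2157)  # 2157 is an illustrative CSID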

View File

@@ -13,20 +13,31 @@ from FourmiCrawler.items import Result
 # Result item, but should be included eventually.
 class NIST(Source):
-    """NIST Scraper plugin
+    """
+    NIST Scraper plugin
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook\\.nist\\.gov/.*"
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
     def __init__(self, config=None):
+        """
+        Initialization of NIST scraper
+        :param config: configuration variables for this scraper, must contain
+        'reliability' key.
+        """
         Source.__init__(self, config)
         self.ignore_list = set()
     def parse(self, response):
+        """
+        This function is called when a Response matching the variable
+        'website' is available for parsing the Response object.
+        :param response: The Scrapy Response object to be parsed
+        :return: a list of Result items and Request objects
+        """
         sel = Selector(response)
         title = sel.xpath('head/title/text()').extract()[0]
@@ -51,6 +62,21 @@ class NIST(Source):
             log.msg('NIST symbol: |%s|, name: |%s|' % (symbol, name),
                     level=log.DEBUG)
+        requests.extend(self.parse_tables(sel, symbol_table))
+        return requests
+    def parse_tables(self, sel, symbol_table):
+        """
+        This function identifies and distributes parsing of tables to other
+        functions below.
+        :param sel: A Selector object of the whole page
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Requests
+        """
+        requests = []
         for table in sel.xpath('//table[@class="data"]'):
             summary = table.xpath('@summary').extract()[0]
             if summary == 'One dimensional data':
@@ -81,8 +107,12 @@ class NIST(Source):
         return requests
     def parse_generic_info(self, sel):
-        """Parses: synonyms, chemical formula, molecular weight, InChI,
-        InChiKey, CAS number
+        """
+        This function parses: synonyms, chemical formula, molecular weight,
+        InChI, InChiKey, CAS number
+        :param sel: A Selector object of the entire page in the original
+        response
+        :return: a list of Result items
         """
         ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]')
@@ -121,15 +151,20 @@ class NIST(Source):
         return requests
     def parse_aggregate_data(self, table, symbol_table):
-        """Parses the table(s) which contain possible links to individual
-        data points
+        """
+        This function parses the table(s) which contain possible links to
+        individual data points
+        :param table: a Selector object of the table to be parsed
+        :param symbol_table: a dictionary containing translations of raw HTML
+        tags to human readable names
+        :return: a list of Result items and Request objects
         """
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
-                request = Request(url=self.website[:-1] + extra_data_url[0],
+                request = Request(url=self.website[:-2].replace("\\", "") + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
@@ -155,14 +190,16 @@ class NIST(Source):
         return results
     def parse_transition_data(self, table, summary):
-        """Parses the table containing properties regarding phase changes"""
+        """
+        This function parses the table containing properties regarding phase
+        changes
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
         results = []
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -176,18 +213,18 @@ class NIST(Source):
         return results
     def parse_generic_data(self, table, summary):
-        """Parses the common tables of 4 and 5 rows. Assumes they are of the
+        """
+        Parses the common tables of 4 and 5 rows. Assumes they are of the
         form:
         Symbol (unit)|Temperature (K)|Method|Reference|Comment
         Symbol (unit)|Temperature (K)|Reference|Comment
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
         """
         results = []
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -200,7 +237,13 @@ class NIST(Source):
         return results
     def parse_antoine_data(self, table, summary):
-        """Parse table containing parameters for the Antione equation"""
+        """
+        This function parses the table containing parameters for the Antione
+        equation
+        :param table: a Selector object of the table to be parsed
+        :param summary: the name of the property
+        :return: a list of Result items
+        """
         results = []
         for tr in table.xpath('tr[td]'):
@@ -215,7 +258,12 @@ class NIST(Source):
         return results
     def parse_individual_datapoints(self, response):
-        """Parses the page linked from aggregate data"""
+        """
+        This function parses the 'individual data points' page linked from
+        the aggregate data table(s)
+        :param response: the Scrapy Response object to be parsed
+        :return: a list of Result items
+        """
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
@@ -228,11 +276,7 @@ class NIST(Source):
             name = m.group(1)
             condition = m.group(2)
-        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
-        m = re.search(r'\((.*)\)', tr_unit)
-        unit = '!'
-        if m:
-            unit = m.group(1)
+        unit = self.get_unit(table)
         for tr in table.xpath('tr[td]'):
             tds = tr.xpath('td/text()').extract()
@@ -250,7 +294,25 @@ class NIST(Source):
         return results
+    @staticmethod
+    def get_unit(table):
+        tr_unit = ''.join(table.xpath('tr[1]/th[1]/node()').extract())
+        m = re.search(r'\((.*)\)', tr_unit)
+        unit = '!'
+        if m:
+            unit = m.group(1)
+        return unit
     def newresult(self, attribute, value, conditions=''):
+        """
+        This function abstracts from the Result item and provides default
+        values
+        :param attribute: the name of the attribute
+        :param value: the value of the attribute
+        :param conditions: optional conditions regarding the value
+        :return: A Result item
+        """
         return Result(
             {
                 'attribute': attribute,
@@ -261,7 +323,12 @@ class NIST(Source):
             })
     def new_compound_request(self, compound):
+        """
+        This function is called when a new synonym is returned to the spider
+        to generate new requests
+        :param compound: the name of the compound to search for
+        """
         if compound not in self.ignore_list:
             self.ignore_list.update(compound)
-            return Request(url=self.website[:-1] + self.search % compound,
+            return Request(url=self.website[:-2].replace("\\", "") + self.search % compound,
                            callback=self.parse)
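Note: this file previously extracted the unit from a table header in three identical blocks; they now collapse into the new `get_unit` helper. A standalone sketch of its behavior, with illustrative header strings:

import re

def get_unit(header):
    # Mirrors NIST.get_unit: the unit is whatever is parenthesised in the
    # first header cell; '!' is the fallback sentinel kept from the old code.
    m = re.search(r'\((.*)\)', header)
    return m.group(1) if m else '!'

assert get_unit('Ttriple (K)') == 'K'  # e.g. a temperature column header
assert get_unit('Reference') == '!'    # no unit present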

View File

@@ -1,9 +1,11 @@
+import re
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+from source import Source
 from FourmiCrawler.items import Result
-import re
 class PubChem(Source):
@@ -14,9 +16,9 @@ class PubChem(Source):
     """
     # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used
-    website = 'https://*.ncbi.nlm.nih.gov/*'
-    website_www = 'https://www.ncbi.nlm.nih.gov/*'
-    website_pubchem = 'https://pubchem.ncbi.nlm.nih.gov/*'
+    website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*'
+    website_www = 'http://www.ncbi.nlm.nih.gov/*'
+    website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'
     search = 'pccompound?term=%s'
     data_url = 'toc/summary_toc.cgi?tocid=27&cid=%s'
@@ -56,10 +58,12 @@ class PubChem(Source):
         # the seperate html page which contains the properties and their values
         # using this cid to get the right url and scrape it
-        requests.append(Request(url=self.website_pubchem[:-1] + self.data_url % cid, callback=self.parse_data))
+        requests.append(
+            Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data))
         return requests
-    def parse_data(self, response):
+    @staticmethod
+    def parse_data(response):
         """
         Parse data found in 'Chemical and Physical properties' part of a substance page.
         :param response: The response with the page to parse
@@ -80,7 +84,7 @@ class PubChem(Source):
                 'attribute': prop_name,
                 'value': prop_value,
                 'source': prop_source,
-                'reliability': 'Unknown',
+                'reliability': self.cfg['reliability'],
                 'conditions': ''
             })
             log.msg('PubChem prop: |%s| |%s| |%s|' %
@@ -96,7 +100,7 @@ class PubChem(Source):
                 'attribute': prop_name,
                 'value': prop_value,
                 'source': prop_source,
-                'reliability': 'Unknown',
+                'reliability': self.cfg['reliability'],
                 'conditions': ''
             })
             log.msg('PubChem prop: |%s| |%s| |%s|' %
@@ -106,6 +110,41 @@ class PubChem(Source):
         return requests
+    def parse_searchrequest(self, response):
+        """
+        This function parses the response to the new_compound_request Request
+        :param response: the Response object to be parsed
+        :return: A Request for the compound page or what self.parse returns in
+        case the search request forwarded to the compound page
+        """
+        # check if pubchem forwarded straight to compound page
+        m = re.match(self.website_pubchem, response.url)
+        if m:
+            log.msg('PubChem search forwarded to compound page',
+                    level=log.DEBUG)
+            return self.parse(response)
+        sel = Selector(response)
+        results = sel.xpath('//div[@class="rsltcont"]')
+        if results:
+            url = results[0].xpath('div/p/a[1]/@href')
+        else:
+            log.msg('PubChem search found nothing or xpath failed',
+                    level=log.DEBUG)
+            return None
+        if url:
+            url = 'http:' + ''.join(url[0].extract())
+            log.msg('PubChem compound page: %s' % url, level=log.DEBUG)
+        else:
+            log.msg('PubChem search found results, but no url in first result',
+                    level=log.DEBUG)
+            return None
+        return Request(url=url, callback=self.parse)
     def new_compound_request(self, compound):
-        return Request(url=self.website_www[:-1] + self.search % compound, callback=self.parse)
+        return Request(url=self.website_www[:-1] + self.search % compound,
+                       callback=self.parse_searchrequest)
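Note: PubChem may answer a search by forwarding straight to a compound page, which is why the new `parse_searchrequest` first tests the response URL against `website_pubchem`. A condensed, hypothetical sketch of that dispatch test (example URLs illustrative):

import re

website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*'

def forwarded_to_compound_page(url):
    # Same test as in parse_searchrequest: a compound-page URL means the
    # search was forwarded, so the response goes to self.parse instead of
    # the result-list scraping below it.
    return re.match(website_pubchem, url) is not None

assert forwarded_to_compound_page('http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?cid=2519')
assert not forwarded_to_compound_page('http://www.ncbi.nlm.nih.gov/pccompound?term=caffeine')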

View File

@@ -15,7 +15,7 @@ class WikipediaParser(Source):
     It also returns requests with other external sources which contain information on parsed subject.
     """
-    website = "http://en.wikipedia.org/wiki/*"
+    website = "http://en\\.wikipedia\\.org/wiki/.*"
     __spider = None
     searched_compounds = []
@@ -123,7 +123,7 @@ class WikipediaParser(Source):
         return items
     def new_compound_request(self, compound):
-        return Request(url=self.website[:-1] + compound, callback=self.parse)
+        return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
     @staticmethod
     def clean_items(items):

View File

@@ -3,7 +3,7 @@ from scrapy import log
 class Source:
-    website = "http://something/*"  # Regex of URI's the source is able to parse
+    website = "http://something/.*"  # Regex of URI's the source is able to parse
     _spider = None
     def __init__(self, config=None):
@@ -30,7 +30,7 @@ class Source:
         :param compound: A compound name.
         :return: A new Scrapy Request
         """
-        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        # return Request(url=self.website[:-2].replace("\\", "") + compound, callback=self.parse)
         pass
     def set_spider(self, spider):

View File

@@ -34,8 +34,9 @@ class FourmiSpider(Spider):
         """
         for source in self._sources:
             if re.match(source.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                log.msg("URL: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
+        log.msg("URL: " + response.url + " -> No compatible source", level=log.INFO)
         return None
     def get_synonym_requests(self, compound, force=False):

View File

@@ -48,7 +48,6 @@ __Main goals:__
 - Build an graphical user interface(GUI) as alternative for the command line
   interface(CLI). (Assignee: Harmen)
 - Compiling the source into an windows executable. (Assignee: Bas)
-- Create an module to gather data from PubChem. (Assignee: Nout)
 __Side goals:__

View File

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
+Fourmi, a web scraper build to search specific information for a given compound (and its pseudonyms).
 Usage:
     fourmi
@@ -18,7 +18,7 @@ Options:
     --version                      Show version.
     -v                             Verbose logging output. (Multiple occurrences increase logging level)
     --log=<file>                   Save log to an file.
-    -o <file> --output=<file>      Output file [default: results.*format*]
+    -o <file> --output=<file>      Output file [default: <compound>.*format*]
     -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: csv]
     --include=<regex>              Include only sources that match these regular expressions split by a comma.
     --exclude=<regex>              Exclude the sources that match these regular expressions split by a comma.
@@ -61,7 +61,7 @@ def search(docopt_arguments, source_loader):
     """
     conf = Configurator()
     conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                   source_loader, docopt_arguments["--attributes"].split(','))
     if conf.scrapy_settings.getbool("LOG_ENABLED"):

sources.cfg.sample (new file, 19 additions)
View File

@@ -0,0 +1,19 @@
+[DEFAULT]
+reliability = Unknown
+
+#For each source listed in FourmiCrawler/sources there should be a section
+#named exactly as the filename in here. If not present, the DEFAULT value is
+#used for reliability of that source.
+
+[ChemSpider]
+reliability = High
+#token=Paste ChemSpider API token here and remove the hashtag
+
+[NIST]
+reliability = High
+
+[WikipediaParser]
+reliability = Medium
+
+[PubChem]
+reliability = High
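Note: the sample relies on ConfigParser's [DEFAULT] inheritance, so any source without its own section falls back to reliability = Unknown. A small sketch of how this behaves (Python 2, matching the project; hypothetical driver code):

import ConfigParser  # Python 2 module, as used by utils/configurator.py

config = ConfigParser.ConfigParser()
config.read('sources.cfg')

# Explicit per-source value:
print(config.get('ChemSpider', 'reliability'))  # 'High'
# Sources with no section of their own inherit the DEFAULT value:
print(config.defaults()['reliability'])         # 'Unknown'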

View File

@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
         self.conf = Configurator()
     def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("<compound>.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
     def test_start_log(self):

View File

@@ -1,4 +1,5 @@
 import ConfigParser
+import os
 from scrapy.utils.project import get_project_settings
@@ -12,7 +13,7 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()
-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
         In the Fourmi project these are command line arguments.
@@ -20,12 +21,12 @@ class Configurator:
         :param fileformat: The format in which the output will be.
         """
-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
             self.scrapy_settings.overrides["FEED_URI"] = filename
         elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
         elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
@@ -66,8 +67,11 @@ class Configurator:
         variables for sources
         :return a ConfigParser object of sources.cfg
         """
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        config_path = current_dir + '/../sources.cfg'
+        # [TODO]: location of sources.cfg should be softcoded eventually
         config = ConfigParser.ConfigParser()
-        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
+        config.read(config_path)
         return config
     @staticmethod
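Note: with the new `compound` parameter, the default output file is named after the searched compound instead of a fixed results.* name. A standalone sketch (hypothetical helper mirroring the branches in Configurator.set_output) of the naming rule:

def output_name(filename, fileformat, compound):
    # Mirrors set_output: an explicit -o filename wins outright.
    if filename != '<compound>.*format*':
        return filename
    if fileformat == "jsonlines":  # jsonlines output still gets a .json name
        return compound + ".json"
    if fileformat is not None:
        return compound + "." + fileformat
    return filename

assert output_name('<compound>.*format*', 'csv', 'caffeine') == 'caffeine.csv'
assert output_name('<compound>.*format*', 'jsonlines', 'caffeine') == 'caffeine.json'
assert output_name('out.xml', 'xml', 'caffeine') == 'out.xml'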