Archived
1
0

Merge branch 'release/v0.5.0'

This commit is contained in:
Jip J. Dekker 2014-06-08 22:39:53 +02:00
commit 16248577b0
15 changed files with 276 additions and 168 deletions

3
.gitignore vendored
View File

@ -4,6 +4,9 @@
#Python Specific ignores #Python Specific ignores
*.pyc *.pyc
#may contain authentication information
sources.cfg
#THINGS WE WOULD NEVER EVER WANT! #THINGS WE WOULD NEVER EVER WANT!
#ignore thumbnails created by windows #ignore thumbnails created by windows
Thumbs.db Thumbs.db

View File

@ -10,7 +10,7 @@ install:
# command to run tests, e.g. python setup.py test # command to run tests, e.g. python setup.py test
script: script:
- nosetests --with-coverage --cover-package=FourmiCrawler tests - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests
notifications: notifications:
slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM

View File

@ -9,7 +9,7 @@ from FourmiCrawler.items import Result
# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not
class ChemSpider(Source): class ChemSpider(Source):
"""ChemSpider scraper for synonyms and properties """ChemSpider scraper for synonyms and properties
@ -20,19 +20,23 @@ class ChemSpider(Source):
somewhere. somewhere.
""" """
def __init__(self):
Source.__init__(self)
website = 'http://www.chemspider.com/*' website = 'http://www.chemspider.com/*'
# [TODO] - Save and access token of specific user. search = 'Search.asmx/SimpleSearch?query=%s&token='
search = ('Search.asmx/SimpleSearch?query=%s&token='
'052bfd06-5ce4-43d6-bf12-89eabefd2338')
structure = 'Chemical-Structure.%s.html' structure = 'Chemical-Structure.%s.html'
extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='
'052bfd06-5ce4-43d6-bf12-89eabefd2338')
def __init__(self, config={}):
    """
    Initialise the ChemSpider source with its configuration section.

    :param config: A dictionary with the settings of this source. The value stored under 'token' is
    appended to the search and extended compound info API URLs.
    """
    # Work on a private copy: the empty-token fallback below writes into the dictionary, which would
    # otherwise modify the caller's dictionary or the shared default argument.
    cfg = dict(config)
    Source.__init__(self, cfg)
    self.cfg = cfg
    self.ignore_list = []
    if self.cfg.get('token', '') == '':
        log.msg('ChemSpider token not set or empty, search/MassSpec API '
                'not available', level=log.WARNING)
        self.cfg['token'] = ''
    # Augmented assignment creates instance attributes; the class-level URL templates stay untouched.
    self.search += self.cfg['token']
    self.extendedinfo += self.cfg['token']
ignore_list = []
def parse(self, response): def parse(self, response):
sel = Selector(response) sel = Selector(response)
@ -44,8 +48,7 @@ class ChemSpider(Source):
return requests return requests
@staticmethod def parse_properties(self, sel):
def parse_properties(sel):
"""scrape Experimental Data and Predicted ACD/Labs tabs""" """scrape Experimental Data and Predicted ACD/Labs tabs"""
properties = [] properties = []
@ -76,13 +79,12 @@ class ChemSpider(Source):
prop_value = m.group(1) prop_value = m.group(1)
prop_conditions = m.group(2) prop_conditions = m.group(2)
new_prop = Result({ new_prop = self.newresult(
'attribute': prop_name, attribute=prop_name,
'value': prop_value, value=prop_value,
'source': 'ChemSpider Predicted - ACD/Labs Tab', source='ChemSpider Predicted - ACD/Labs Tab',
'reliability': 'Unknown', conditions=prop_conditions
'conditions': prop_conditions )
})
properties.append(new_prop) properties.append(new_prop)
log.msg('CS prop: |%s| |%s| |%s|' % log.msg('CS prop: |%s| |%s| |%s|' %
(new_prop['attribute'], new_prop['value'], new_prop['source']), (new_prop['attribute'], new_prop['value'], new_prop['source']),
@ -100,14 +102,11 @@ class ChemSpider(Source):
if line.xpath('span/text()'): if line.xpath('span/text()'):
property_name = line.xpath('span/text()').extract()[0].rstrip() property_name = line.xpath('span/text()').extract()[0].rstrip()
else: else:
new_prop = Result({ new_prop = self.newresult(
'attribute': property_name[:-1], attribute=property_name[:-1],
'value': line.xpath('text()').extract()[0].rstrip(), value=line.xpath('text()').extract()[0].rstrip(),
'source': line.xpath( source=line.xpath('strong/text()').extract()[0].rstrip(),
'strong/text()').extract()[0].rstrip(), )
'reliability': 'Unknown',
'conditions': ''
})
properties.append(new_prop) properties.append(new_prop)
log.msg('CS prop: |%s| |%s| |%s|' % log.msg('CS prop: |%s| |%s| |%s|' %
(new_prop['attribute'], new_prop['value'], (new_prop['attribute'], new_prop['value'],
@ -183,25 +182,31 @@ class ChemSpider(Source):
} }
return synonym return synonym
@staticmethod def parse_extendedinfo(self, response):
def parse_extendedinfo(response):
"""Scrape data from the ChemSpider GetExtendedCompoundInfo API""" """Scrape data from the ChemSpider GetExtendedCompoundInfo API"""
sel = Selector(response) sel = Selector(response)
properties = [] properties = []
names = sel.xpath('*').xpath('name()').extract() names = sel.xpath('*').xpath('name()').extract()
values = sel.xpath('*').xpath('text()').extract() values = sel.xpath('*').xpath('text()').extract()
for (name, value) in zip(names, values): for (name, value) in zip(names, values):
result = Result({ result = self.newresult(
'attribute': name, attribute=name,
'value': value, # These values have no unit! value=value, # These values have no unit!
'source': 'ChemSpider ExtendedCompoundInfo', source='ChemSpider ExtendedCompoundInfo',
'reliability': 'Unknown', )
'conditions': ''
})
if result['value']: if result['value']:
properties.append(result) properties.append(result)
return properties return properties
def newresult(self, attribute, value, conditions='', source='ChemSpider'):
    """
    Build a Result item for a property scraped from ChemSpider.

    :param attribute: The name of the property.
    :param value: The value of the property.
    :param conditions: The conditions that apply to the value, empty by default.
    :param source: The label stored as the source of the value.
    :return: A Result item whose reliability is read from the configuration of this source.
    """
    return Result({
        'attribute': attribute,
        'value': value,
        'source': source,
        # Fall back to an empty string instead of raising a KeyError when the configuration has no
        # 'reliability' entry (e.g. the source was created without a configuration section).
        'reliability': self.cfg.get('reliability', ''),
        'conditions': conditions
    })
def parse_searchrequest(self, response): def parse_searchrequest(self, response):
"""Parse the initial response of the ChemSpider Search API """ """Parse the initial response of the ChemSpider Search API """
sel = Selector(response) sel = Selector(response)
@ -224,7 +229,7 @@ class ChemSpider(Source):
callback=self.parse_extendedinfo)] callback=self.parse_extendedinfo)]
def new_compound_request(self, compound): def new_compound_request(self, compound):
if compound in self.ignore_list: # [TODO] - add regular expression if compound in self.ignore_list or self.cfg['token'] == '':
return None return None
searchurl = self.website[:-1] + self.search % compound searchurl = self.website[:-1] + self.search % compound
log.msg('chemspider compound', level=log.DEBUG) log.msg('chemspider compound', level=log.DEBUG)

View File

@ -22,10 +22,12 @@ class NIST(Source):
search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
ignore_list = set() cfg = {}
def __init__(self): def __init__(self, config={}):
Source.__init__(self) Source.__init__(self, config)
self.ignore_list = set()
self.cfg = config
def parse(self, response): def parse(self, response):
sel = Selector(response) sel = Selector(response)
@ -114,13 +116,10 @@ class NIST(Source):
requests = [] requests = []
for key, value in data.iteritems(): for key, value in data.iteritems():
result = Result({ result = self.newresult(
'attribute': key, attribute=key,
'value': value, value=value
'source': 'NIST', )
'reliability': 'Unknown',
'conditions': ''
})
requests.append(result) requests.append(result)
return requests return requests
@ -150,19 +149,16 @@ class NIST(Source):
name = m.group(1) name = m.group(1)
condition = m.group(2) condition = m.group(2)
result = Result({ result = self.newresult(
'attribute': name, attribute=name,
'value': data[1] + ' ' + data[2], value=data[1] + ' ' + data[2],
'source': 'NIST', conditions=condition
'reliability': 'Unknown', )
'conditions': condition
})
log.msg('NIST: |%s|' % data, level=log.DEBUG) log.msg('NIST: |%s|' % data, level=log.DEBUG)
results.append(result) results.append(result)
return results return results
@staticmethod def parse_transition_data(self, table, summary):
def parse_transition_data(table, summary):
"""Parses the table containing properties regarding phase changes""" """Parses the table containing properties regarding phase changes"""
results = [] results = []
@ -174,19 +170,16 @@ class NIST(Source):
for tr in table.xpath('tr[td]'): for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract() tds = tr.xpath('td/text()').extract()
result = Result({ result = self.newresult(
'attribute': summary, attribute=summary,
'value': tds[0] + ' ' + unit, value=tds[0] + ' ' + unit,
'source': 'NIST', conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
'reliability': 'Unknown', )
'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3])
})
results.append(result) results.append(result)
return results return results
@staticmethod def parse_generic_data(self, table, summary):
def parse_generic_data(table, summary):
"""Parses the common tables of 4 and 5 rows. Assumes they are of the """Parses the common tables of 4 and 5 rows. Assumes they are of the
form: form:
Symbol (unit)|Temperature (K)|Method|Reference|Comment Symbol (unit)|Temperature (K)|Method|Reference|Comment
@ -202,36 +195,30 @@ class NIST(Source):
for tr in table.xpath('tr[td]'): for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract() tds = tr.xpath('td/text()').extract()
result = Result({ result = self.newresult(
'attribute': summary, attribute=summary,
'value': tds[0] + ' ' + unit, value=tds[0] + ' ' + unit,
'source': 'NIST', conditions='%s K' % tds[1]
'reliability': 'Unknown', )
'conditions': '%s K' % tds[1]
})
results.append(result) results.append(result)
return results return results
@staticmethod def parse_antoine_data(self, table, summary):
def parse_antoine_data(table, summary):
"""Parse table containing parameters for the Antione equation""" """Parse table containing parameters for the Antione equation"""
results = [] results = []
for tr in table.xpath('tr[td]'): for tr in table.xpath('tr[td]'):
tds = tr.xpath('td/text()').extract() tds = tr.xpath('td/text()').extract()
result = Result({ result = self.newresult(
'attribute': summary, attribute=summary,
'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]),
'source': 'NIST', conditions='%s K' % tds[0]
'reliability': 'Unknown', )
'conditions': '%s K' % tds[0]
})
results.append(result) results.append(result)
return results return results
@staticmethod def parse_individual_datapoints(self, response):
def parse_individual_datapoints(response):
"""Parses the page linked from aggregate data""" """Parses the page linked from aggregate data"""
sel = Selector(response) sel = Selector(response)
table = sel.xpath('//table[@class="data"]')[0] table = sel.xpath('//table[@class="data"]')[0]
@ -258,17 +245,24 @@ class NIST(Source):
if m: if m:
uncertainty = '+- %s ' % m.group(1) uncertainty = '+- %s ' % m.group(1)
# [TODO]: get the plusminus sign working in here # [TODO]: get the plusminus sign working in here
result = Result({ result = self.newresult(
'attribute': name, attribute=name,
'value': '%s %s%s' % (tds[0], uncertainty, unit), value='%s %s%s' % (tds[0], uncertainty, unit),
'source': 'NIST', conditions=condition
'reliability': 'Unknown', )
'conditions': condition
})
results.append(result) results.append(result)
return results return results
def newresult(self, attribute, value, conditions=''):
    """
    Build a Result item for a property scraped from NIST.

    :param attribute: The name of the property.
    :param value: The value of the property.
    :param conditions: The conditions that apply to the value, empty by default.
    :return: A Result item with 'NIST' as source and the reliability read from the configuration.
    """
    return Result({
        'attribute': attribute,
        'value': value,
        'source': 'NIST',
        # Fall back to an empty string instead of raising a KeyError when the configuration has no
        # 'reliability' entry (e.g. the source was created without a configuration section).
        'reliability': self.cfg.get('reliability', ''),
        'conditions': conditions
    })
def new_compound_request(self, compound): def new_compound_request(self, compound):
if compound not in self.ignore_list: if compound not in self.ignore_list:
self.ignore_list.update(compound) self.ignore_list.update(compound)

View File

@ -19,8 +19,11 @@ class WikipediaParser(Source):
__spider = None __spider = None
searched_compounds = [] searched_compounds = []
def __init__(self): cfg = {}
Source.__init__(self)
def __init__(self, config={}):
Source.__init__(self, config)
self.cfg = config
def parse(self, response): def parse(self, response):
""" Distributes the above described behaviour """ """ Distributes the above described behaviour """
@ -44,13 +47,10 @@ class WikipediaParser(Source):
prop_names = tr_list[::2] prop_names = tr_list[::2]
prop_values = tr_list[1::2] prop_values = tr_list[1::2]
for i, prop_name in enumerate(prop_names): for i, prop_name in enumerate(prop_names):
item = Result({ item = self.newresult(
'attribute': prop_name.extract().encode('utf-8'), attribute=prop_name.extract().encode('utf-8'),
'value': prop_values[i].extract().encode('utf-8'), value=prop_values[i].extract().encode('utf-8')
'source': "Wikipedia", )
'reliability': "Unknown",
'conditions': ""
})
items.append(item) items.append(item)
log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
@ -61,13 +61,10 @@ class WikipediaParser(Source):
log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
'normalize-space(string())'): 'normalize-space(string())'):
item = Result({ item = self.newresult(
'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
'source': "Wikipedia", )
'reliability': "Unknown",
'conditions': ""
})
items.append(item) items.append(item)
log.msg( log.msg(
'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
@ -116,4 +113,13 @@ class WikipediaParser(Source):
""" find external links, named 'Identifiers' to different sources. """ """ find external links, named 'Identifiers' to different sources. """
links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a'
'[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract()
return links return links
def newresult(self, attribute, value):
    """
    Build a Result item for a property scraped from Wikipedia.

    :param attribute: The name of the property.
    :param value: The value of the property.
    :return: A Result item with 'Wikipedia' as source, empty conditions and the reliability read
    from the configuration.
    """
    return Result({
        'attribute': attribute,
        'value': value,
        'source': 'Wikipedia',
        # Fall back to an empty string instead of raising a KeyError when the configuration has no
        # 'reliability' entry (e.g. the source was created without a configuration section).
        'reliability': self.cfg.get('reliability', ''),
        'conditions': ''
    })

View File

@ -6,7 +6,7 @@ class Source:
website = "http://something/*" # Regex of URI's the source is able to parse website = "http://something/*" # Regex of URI's the source is able to parse
_spider = None _spider = None
def __init__(self): def __init__(self, config={}):
""" """
Initiation of a new Source Initiation of a new Source
""" """

View File

@ -9,8 +9,6 @@ class FourmiSpider(Spider):
A spider written for the Fourmi Project which calls upon all available sources to request and scrape data. A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
""" """
name = "FourmiSpider" name = "FourmiSpider"
_sources = []
synonyms = set()
def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
""" """
@ -18,6 +16,8 @@ class FourmiSpider(Spider):
:param compound: compound that will be searched. :param compound: compound that will be searched.
:param selected_attributes: A list of regular expressions that the attributes should match. :param selected_attributes: A list of regular expressions that the attributes should match.
""" """
self._sources = []
self.synonyms = set()
super(FourmiSpider, self).__init__(*args, **kwargs) super(FourmiSpider, self).__init__(*args, **kwargs)
self.synonyms.add(compound) self.synonyms.add(compound)
self.selected_attributes = selected_attributes self.selected_attributes = selected_attributes

View File

@ -1,8 +1,8 @@
# Fourmi # Fourmi
**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) **Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master)
**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) **Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop)
Fourmi is a web scraper for chemical substances. The program is designed to be Fourmi is a web scraper for chemical substances. The program is designed to be
used as a search engine to search multiple chemical databases for a specific used as a search engine to search multiple chemical databases for a specific

View File

@ -17,8 +17,8 @@ Options:
--version Show version. --version Show version.
--verbose Verbose logging output. --verbose Verbose logging output.
--log=<file> Save log to an file. --log=<file> Save log to an file.
-o <file> --output=<file> Output file [default: result.*format*] -o <file> --output=<file> Output file [default: results.*format*]
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: csv]
--include=<regex> Include only sources that match these regular expressions split by a comma. --include=<regex> Include only sources that match these regular expressions split by a comma.
--exclude=<regex> Exclude the sources that match these regular expressions split by a comma. --exclude=<regex> Exclude the sources that match these regular expressions split by a comma.
""" """
@ -30,7 +30,8 @@ from scrapy.utils.project import get_project_settings
import docopt import docopt
from FourmiCrawler.spider import FourmiSpider from FourmiCrawler.spider import FourmiSpider
from sourceloader import SourceLoader from utils.configurator import Configurator
from utils.sourceloader import SourceLoader
def setup_crawler(compound, settings, source_loader, attributes): def setup_crawler(compound, settings, source_loader, attributes):
@ -50,59 +51,22 @@ def setup_crawler(compound, settings, source_loader, attributes):
crawler.start() crawler.start()
def scrapy_settings_manipulation(docopt_arguments):
"""
This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
project these are command line arguments.
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
"""
settings = get_project_settings()
if docopt_arguments["--output"] != 'result.*format*':
settings.overrides["FEED_URI"] = docopt_arguments["--output"]
elif docopt_arguments["--format"] == "jsonlines":
settings.overrides["FEED_URI"] = "results.json"
elif docopt_arguments["--format"] is not None:
settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]
if docopt_arguments["--format"] is not None:
settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
return settings
def start_log(docopt_arguments):
"""
This function starts the logging functionality of Scrapy using the settings given by the CLI.
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
"""
if docopt_arguments["--log"] is not None:
if docopt_arguments["--verbose"]:
log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
else:
log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
else:
if docopt_arguments["--verbose"]:
log.start(logstdout=False, loglevel=log.DEBUG)
else:
log.start(logstdout=True, loglevel=log.WARNING)
def search(docopt_arguments, source_loader): def search(docopt_arguments, source_loader):
""" """
The function that facilitates the search for a specific compound. The function that facilitates the search for a specific compound.
:param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
:param source_loader: An initiated SourceLoader object pointed at the directory with the sources. :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
""" """
start_log(docopt_arguments) conf = Configurator()
settings = scrapy_settings_manipulation(docopt_arguments) conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"])
setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(',')) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(','))
reactor.run() reactor.run()
# The start for the Fourmi Command Line interface. # The start for the Fourmi Command Line interface.
if __name__ == '__main__': if __name__ == '__main__':
arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2') arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0')
loader = SourceLoader() loader = SourceLoader()
if arguments["--include"]: if arguments["--include"]:

View File

@ -0,0 +1,50 @@
import unittest
from utils.configurator import Configurator
import ConfigParser
class TestConfigurator(unittest.TestCase):
    """Unit tests for the output and source configuration helpers of the Configurator."""

    def setUp(self):
        self.conf = Configurator()

    def test_set_output(self):
        # An explicit filename is used as-is.
        self.conf.set_output(filename="test.txt", fileformat="csv")
        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

        # The placeholder filename combined with jsonlines maps to the '.json' extension.
        self.conf.set_output("results.*format*", "jsonlines")
        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")

        # The placeholder filename combined with any other format uses the format as extension.
        self.conf.set_output("results.*format*", "csv")
        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
        self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")

    # NOTE(review): this test is disabled in the original; the reason is not recorded here - confirm
    # before enabling or removing it.
    # def test_start_log(self):
    #     self.conf.start_log("test.log", True)
    #     self.conf.start_log("test.log", False)
    #     self.conf.start_log(None, True)
    #     self.conf.start_log(None, False)

    def test_read_sourceconfiguration(self):
        config = self.conf.read_sourceconfiguration()
        self.assertIsInstance(config, ConfigParser.ConfigParser)

    def test_get_section(self):
        # Without a section and without defaults, reliability falls back to an empty string.
        config = ConfigParser.ConfigParser()
        section = self.conf.get_section(config, 'test')
        self.assertIn('reliability', section)
        self.assertEqual(section['reliability'], '')

        # A missing section falls back to the defaults.
        config.set('DEFAULT', 'reliability', 'Low')
        section = self.conf.get_section(config, 'test')
        self.assertEqual(section['reliability'], 'Low')

        # An existing section contains its own values together with the defaults.
        config.add_section('test')
        config.set('test', 'var', 'Maybe')
        section = self.conf.get_section(config, 'test')
        self.assertEqual(section['reliability'], 'Low')
        self.assertEqual(section['var'], 'Maybe')

View File

@ -1,6 +1,6 @@
import unittest import unittest
from sourceloader import SourceLoader from utils.sourceloader import SourceLoader
class TestSourceloader(unittest.TestCase): class TestSourceloader(unittest.TestCase):

View File

@ -3,7 +3,7 @@ import unittest
from scrapy.http import Request from scrapy.http import Request
from FourmiCrawler import spider from FourmiCrawler import spider
from FourmiCrawler.sources.ChemSpider import ChemSpider from FourmiCrawler.sources.NIST import NIST
from FourmiCrawler.sources.source import Source from FourmiCrawler.sources.source import Source
@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase):
self.spi.add_source(src) self.spi.add_source(src)
self.assertEqual(self.spi.start_requests(), []) self.assertEqual(self.spi.start_requests(), [])
src2 = ChemSpider() src2 = NIST()
self.spi.add_source(src2) self.spi.add_source(src2)
requests = self.spi.start_requests() requests = self.spi.start_requests()
self.assertGreater(len(requests), 0) self.assertGreater(len(requests), 0)
@ -57,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase):
self.assertEqual(self.spi.get_synonym_requests("new_compound"), []) self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
self.assertIn("new_compound", self.spi.synonyms) self.assertIn("new_compound", self.spi.synonyms)
src2 = ChemSpider() src2 = NIST()
self.spi.add_source(src2) self.spi.add_source(src2)
self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request) self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
self.assertIn("other_compound", self.spi.synonyms) self.assertIn("other_compound", self.spi.synonyms)
self.assertEqual(self.spi.get_synonym_requests("other_compound"), []) self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])

0
utils/__init__.py Normal file
View File

81
utils/configurator.py Normal file
View File

@ -0,0 +1,81 @@
from scrapy import log
from scrapy.utils.project import get_project_settings
import ConfigParser
class Configurator:
    """
    A helper class for the Fourmi applications. This class is used to process the settings as set
    from one of the Fourmi applications.
    """

    def __init__(self):
        self.scrapy_settings = get_project_settings()

    def set_output(self, filename, fileformat):
        """
        This function manipulates the Scrapy output file settings that normally would be set in the settings file.
        In the Fourmi project these are command line arguments.
        :param filename: The filename of the file where the output will be put.
        :param fileformat: The format in which the output will be.
        """
        # 'results.*format*' is the placeholder default; any other value is an explicit filename.
        if filename != 'results.*format*':
            self.scrapy_settings.overrides["FEED_URI"] = filename
        elif fileformat == "jsonlines":
            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
        elif fileformat is not None:
            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat

        if fileformat is not None:
            self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat

    def start_log(self, logfile, verbose):
        """
        This function starts the logging functionality of Scrapy using the settings given by the CLI.
        :param logfile: The location where the logfile will be saved.
        :param verbose: A boolean value to switch between loglevels.
        """
        if logfile is not None:
            if verbose:
                log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG)
            else:
                log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING)
        else:
            if verbose:
                log.start(logstdout=False, loglevel=log.DEBUG)
            else:
                log.start(logstdout=True, loglevel=log.WARNING)

    @staticmethod
    def read_sourceconfiguration():
        """
        This function reads sources.cfg in the main folder for configuration
        variables for sources
        :return a ConfigParser object of sources.cfg
        """
        config = ConfigParser.ConfigParser()
        config.read('sources.cfg')  # [TODO]: should be softcoded eventually
        return config

    @staticmethod
    def get_section(config, sourcename):
        """
        This function reads a config section labeled in variable sourcename and
        tests whether the reliability variable is set else set to empty string.
        Return the default section if the labeled config section does not exist
        :param config: a ConfigParser object
        :param sourcename: the name of the section to be read
        :return a new dictionary of the section in the config labeled in sourcename
        """
        if config.has_section(sourcename):
            section = dict(config.items(sourcename))
        else:
            # Copy the defaults instead of returning the mapping held by the parser: the
            # reliability fallback below (and any change made by the caller) would otherwise
            # be written into the DEFAULT section of the parser itself.
            section = dict(config.defaults())
        if 'reliability' not in section:
            log.msg('Reliability not set for %s' % sourcename,
                    level=log.WARNING)
            section['reliability'] = ''
        return section

View File

@ -3,26 +3,31 @@ import os
import re import re
from FourmiCrawler.sources.source import Source from FourmiCrawler.sources.source import Source
from utils.configurator import Configurator
class SourceLoader: class SourceLoader:
sources = [] sources = []
def __init__(self, rel_dir="FourmiCrawler/sources"): def __init__(self, rel_dir="../FourmiCrawler/sources"):
""" """
The initiation of a SourceLoader, selects and indexes a directory for usable sources. The initiation of a SourceLoader, selects and indexes a directory for usable sources.
Also loads a configuration file for Sources and passes the arguments in
the named section to the source
:param rel_dir: A relative path to a directory. :param rel_dir: A relative path to a directory.
""" """
path = os.path.dirname(os.path.abspath(__file__)) path = os.path.dirname(os.path.abspath(__file__))
path += "/" + rel_dir path += "/" + rel_dir
known_parser = set() known_parser = set()
config = Configurator.read_sourceconfiguration()
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py])
classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
for cls in classes: for cls in classes:
if issubclass(cls, Source) and cls not in known_parser: if issubclass(cls, Source) and cls not in known_parser:
self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? sourcecfg = Configurator.get_section(config, cls.__name__)
self.sources.append(cls(sourcecfg))
known_parser.add(cls) known_parser.add(cls)
def include(self, source_names): def include(self, source_names):
@ -55,4 +60,4 @@ class SourceLoader:
string += "Source: " + src.__class__.__name__ string += "Source: " + src.__class__.__name__
string += " - " string += " - "
string += "URI: " + src.website + "\n" string += "URI: " + src.website + "\n"
return string return string