From 828928f1ddc23b24fd2cbfa3d225a8922bc5be1d Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 5 Jun 2014 15:55:01 +0200 Subject: [PATCH 01/56] added sources.cfg to git ignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 158ef41..14c4e72 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ #Python Specific ignores *.pyc +#may contain authentication information +sources.cfg + #THINGS WE WOULD NEVER EVER WANT! #ignore thumbnails created by windows Thumbs.db From fb3c7602497d864b06a893f93ddedf5f965cb6c3 Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 5 Jun 2014 16:22:52 +0200 Subject: [PATCH 02/56] sourceloader now reads sources.cfg for source initialization --- sourceloader.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sourceloader.py b/sourceloader.py index 2ed50a8..5ee22df 100644 --- a/sourceloader.py +++ b/sourceloader.py @@ -1,6 +1,7 @@ import inspect import os import re +import ConfigParser from FourmiCrawler.sources.source import Source @@ -17,11 +18,17 @@ class SourceLoader: path += "/" + rel_dir known_parser = set() + config = ConfigParser.ConfigParser() + config.read('sources.cfg') + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if issubclass(cls, Source) and cls not in known_parser: + sourcecfg = dict() + if config.has_section(cls.__name__): + sourcecfg = dict(config.items(cls.__name__)) self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? 
known_parser.add(cls) @@ -55,4 +62,4 @@ class SourceLoader: string += "Source: " + src.__class__.__name__ string += " - " string += "URI: " + src.website + "\n" - return string \ No newline at end of file + return string From ff3b81b81375cfc2c2f23b0cc9236e1a7356ff47 Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 5 Jun 2014 16:30:48 +0200 Subject: [PATCH 03/56] each source now receives a configuration dictionary --- FourmiCrawler/sources/ChemSpider.py | 4 ++-- FourmiCrawler/sources/NIST.py | 4 ++-- FourmiCrawler/sources/WikipediaParser.py | 6 +++--- FourmiCrawler/sources/source.py | 2 +- sourceloader.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 8c0bd8b..147b9b1 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -20,8 +20,8 @@ class ChemSpider(Source): somewhere. """ - def __init__(self): - Source.__init__(self) + def __init__(self, config): + Source.__init__(self, config) website = 'http://www.chemspider.com/*' diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 6e8fabb..b125790 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -24,8 +24,8 @@ class NIST(Source): ignore_list = set() - def __init__(self): - Source.__init__(self) + def __init__(self, config): + Source.__init__(self, config) def parse(self, response): sel = Selector(response) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 868b49f..56adc4c 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -19,8 +19,8 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] - def __init__(self): - Source.__init__(self) + def __init__(self, config): + Source.__init__(self, config) def parse(self, response): """ Distributes the above described behaviour """ @@ -116,4 +116,4 @@ class 
WikipediaParser(Source): """ find external links, named 'Identifiers' to different sources. """ links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() - return links \ No newline at end of file + return links diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index d289d72..603d91f 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -6,7 +6,7 @@ class Source: website = "http://something/*" # Regex of URI's the source is able to parse _spider = None - def __init__(self): + def __init__(self, config): """ Initiation of a new Source """ diff --git a/sourceloader.py b/sourceloader.py index 5ee22df..512ca7a 100644 --- a/sourceloader.py +++ b/sourceloader.py @@ -29,7 +29,7 @@ class SourceLoader: sourcecfg = dict() if config.has_section(cls.__name__): sourcecfg = dict(config.items(cls.__name__)) - self.sources.append(cls()) # [review] - Would we ever need arguments for the parsers? + self.sources.append(cls(sourcecfg)) known_parser.add(cls) def include(self, source_names): From eb3eee77a0e898e7a8a424674600ebabcc045c5f Mon Sep 17 00:00:00 2001 From: RTB Date: Thu, 5 Jun 2014 16:50:13 +0200 Subject: [PATCH 04/56] updated function description for __init__ --- sourceloader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sourceloader.py b/sourceloader.py index 512ca7a..8a6f8b4 100644 --- a/sourceloader.py +++ b/sourceloader.py @@ -12,6 +12,8 @@ class SourceLoader: def __init__(self, rel_dir="FourmiCrawler/sources"): """ The initiation of a SourceLoader, selects and indexes a directory for usable sources. + Also loads a configuration file for Sources and passes the arguments in + the named section to the source :param rel_dir: A relative path to a directory. 
""" path = os.path.dirname(os.path.abspath(__file__)) From df4ba2f784643fb4095bb0ce3a501393cb3ee1a8 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 12:48:30 +0200 Subject: [PATCH 05/56] changed __init__ of all sources to have an empty dictionary as default config value --- FourmiCrawler/sources/ChemSpider.py | 2 +- FourmiCrawler/sources/NIST.py | 2 +- FourmiCrawler/sources/WikipediaParser.py | 2 +- FourmiCrawler/sources/source.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 147b9b1..6332530 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -20,7 +20,7 @@ class ChemSpider(Source): somewhere. """ - def __init__(self, config): + def __init__(self, config={}): Source.__init__(self, config) website = 'http://www.chemspider.com/*' diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index b125790..2d152e4 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -24,7 +24,7 @@ class NIST(Source): ignore_list = set() - def __init__(self, config): + def __init__(self, config={}): Source.__init__(self, config) def parse(self, response): diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 56adc4c..d0dfd03 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -19,7 +19,7 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] - def __init__(self, config): + def __init__(self, config={}): Source.__init__(self, config) def parse(self, response): diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index 603d91f..a609bb9 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -6,7 +6,7 @@ class Source: website = "http://something/*" # Regex of URI's the source is able to parse _spider = None - def 
__init__(self, config): + def __init__(self, config={}): """ Initiation of a new Source """ From 217fb3e9cd27d25964bea5c29a6f023d37336b40 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 16:17:46 +0200 Subject: [PATCH 06/56] ChemSpider now uses the token from sources.cfg with checks --- FourmiCrawler/sources/ChemSpider.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 6332530..1d79019 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -9,7 +9,7 @@ from FourmiCrawler.items import Result # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception. - +# [TODO] - Add checks at search request and extendedCompoundInfo on whether the token was valid or not class ChemSpider(Source): """ChemSpider scraper for synonyms and properties @@ -20,20 +20,28 @@ class ChemSpider(Source): somewhere. """ - def __init__(self, config={}): - Source.__init__(self, config) - website = 'http://www.chemspider.com/*' - # [TODO] - Save and access token of specific user. 
- search = ('Search.asmx/SimpleSearch?query=%s&token=' - '052bfd06-5ce4-43d6-bf12-89eabefd2338') + search = 'Search.asmx/SimpleSearch?query=%s&token=' structure = 'Chemical-Structure.%s.html' - extendedinfo = ('MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' - '052bfd06-5ce4-43d6-bf12-89eabefd2338') + extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' + cfg = {} ignore_list = [] + def __init__(self, config={}): + Source.__init__(self, config) + self.cfg = config + if 'reliability' not in self.cfg: + log.msg('Reliability not set for ChemSpider', level=log.WARNING) + if 'token' not in self.cfg or self.cfg['token'] == '': + log.msg('ChemSpider token not set or empty, search/MassSpec API ' + 'not available', level=log.WARNING) + self.cfg['token'] = '' + self.search += self.cfg['token'] + self.extendedinfo += self.cfg['token'] + + def parse(self, response): sel = Selector(response) requests = [] @@ -224,7 +232,7 @@ class ChemSpider(Source): callback=self.parse_extendedinfo)] def new_compound_request(self, compound): - if compound in self.ignore_list: # [TODO] - add regular expression + if compound in self.ignore_list or self.cfg['token'] == '': return None searchurl = self.website[:-1] + self.search % compound log.msg('chemspider compound', level=log.DEBUG) From 755c981efa83adb97a8826b07e14431729d982f5 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 18:12:31 +0200 Subject: [PATCH 07/56] created newresult function that uses the config for reliability --- FourmiCrawler/sources/ChemSpider.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 1d79019..0357477 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -210,6 +210,15 @@ class ChemSpider(Source): properties.append(result) return properties + def newresult(self, attribute, value, conditions, source='ChemSpider'): + return Result({ + 'attribute': 
attribute, + 'value': value, + 'source': source, + 'reliability': self.cfg['reliability'], + 'conditions': conditions + }) + def parse_searchrequest(self, response): """Parse the initial response of the ChemSpider Search API """ sel = Selector(response) From b3b879d2adb9afa2cf703a7d641e05d75e2b170e Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 18:17:55 +0200 Subject: [PATCH 08/56] updated parse_extendedinfo to use the newresult function --- FourmiCrawler/sources/ChemSpider.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 0357477..ce28930 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -199,18 +199,16 @@ class ChemSpider(Source): names = sel.xpath('*').xpath('name()').extract() values = sel.xpath('*').xpath('text()').extract() for (name, value) in zip(names, values): - result = Result({ - 'attribute': name, - 'value': value, # These values have no unit! - 'source': 'ChemSpider ExtendedCompoundInfo', - 'reliability': 'Unknown', - 'conditions': '' - }) + result = self.newresult( + attribute=name, + value=value, # These values have no unit! 
+ source='ChemSpider ExtendedCompoundInfo', + ) if result['value']: properties.append(result) return properties - def newresult(self, attribute, value, conditions, source='ChemSpider'): + def newresult(self, attribute, value, conditions='', source='ChemSpider'): return Result({ 'attribute': attribute, 'value': value, From 0021953a9a0bf580287923a82fa7aee217da809d Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 18:20:29 +0200 Subject: [PATCH 09/56] updated parse_properties to use newresult function --- FourmiCrawler/sources/ChemSpider.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index ce28930..38e7684 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -84,13 +84,12 @@ class ChemSpider(Source): prop_value = m.group(1) prop_conditions = m.group(2) - new_prop = Result({ - 'attribute': prop_name, - 'value': prop_value, - 'source': 'ChemSpider Predicted - ACD/Labs Tab', - 'reliability': 'Unknown', - 'conditions': prop_conditions - }) + new_prop = self.newresult( + attribute=prop_name, + value=prop_value, + source='ChemSpider Predicted - ACD/Labs Tab', + conditions=prop_conditions + ) properties.append(new_prop) log.msg('CS prop: |%s| |%s| |%s|' % (new_prop['attribute'], new_prop['value'], new_prop['source']), @@ -108,14 +107,11 @@ class ChemSpider(Source): if line.xpath('span/text()'): property_name = line.xpath('span/text()').extract()[0].rstrip() else: - new_prop = Result({ - 'attribute': property_name[:-1], - 'value': line.xpath('text()').extract()[0].rstrip(), - 'source': line.xpath( - 'strong/text()').extract()[0].rstrip(), - 'reliability': 'Unknown', - 'conditions': '' - }) + new_prop = self.newresult( + attribute=property_name[:-1], + value=line.xpath('text()').extract()[0].rstrip(), + source=line.xpath('strong/text()').extract()[0].rstrip(), + ) properties.append(new_prop) log.msg('CS 
prop: |%s| |%s| |%s|' % (new_prop['attribute'], new_prop['value'], From de21891bff3ae5bf567ebcb20fc57016af3fc9d1 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 18:52:18 +0200 Subject: [PATCH 10/56] created newresult function in NIST.py --- FourmiCrawler/sources/NIST.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 2d152e4..016b704 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -269,6 +269,15 @@ class NIST(Source): return results + def newresult(self, attribute, value, conditions=''): + return Result({ + 'attribute': attribute, + 'value': value, + 'source': 'NIST', + 'reliability': self.cfg['reliability'], + 'conditions': conditions + }) + def new_compound_request(self, compound): if compound not in self.ignore_list: self.ignore_list.update(compound) From e347b7538d1c77fe2ca612dd6847a8b45c7d1d09 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 18:54:38 +0200 Subject: [PATCH 11/56] updated parse_individidual_datapoints to use newresult function --- FourmiCrawler/sources/NIST.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 016b704..209df56 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -230,8 +230,7 @@ class NIST(Source): return results - @staticmethod - def parse_individual_datapoints(response): + def parse_individual_datapoints(self, response): """Parses the page linked from aggregate data""" sel = Selector(response) table = sel.xpath('//table[@class="data"]')[0] @@ -258,13 +257,11 @@ class NIST(Source): if m: uncertainty = '+- %s ' % m.group(1) # [TODO]: get the plusminus sign working in here - result = Result({ - 'attribute': name, - 'value': '%s %s%s' % (tds[0], uncertainty, unit), - 'source': 'NIST', - 'reliability': 'Unknown', - 'conditions': condition - }) + result = self.newresult( + 
attribute=name, + value='%s %s%s' % (tds[0], uncertainty, unit), + conditions=condition + ) results.append(result) return results From a272f9f6d6bd14e0345df2f70237d942d1bc1e70 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 18:58:33 +0200 Subject: [PATCH 12/56] updated parse_antoine_data to use newresult function --- FourmiCrawler/sources/NIST.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 209df56..f20d5ba 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -212,20 +212,17 @@ class NIST(Source): results.append(result) return results - @staticmethod - def parse_antoine_data(table, summary): + def parse_antoine_data(self, table, summary): """Parse table containing parameters for the Antione equation""" results = [] for tr in table.xpath('tr[td]'): tds = tr.xpath('td/text()').extract() - result = Result({ - 'attribute': summary, - 'value': 'A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), - 'source': 'NIST', - 'reliability': 'Unknown', - 'conditions': '%s K' % tds[0] - }) + result = self.newresult( + attribute=summary, + value='A=%s, B=%s, C=%s' % (tds[1], tds[2], tds[3]), + conditions='%s K' % tds[0] + ) results.append(result) return results From ed53889018889201984879ae933b8741b1dffa1d Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:00:04 +0200 Subject: [PATCH 13/56] updated parse_generic_data to use newresult function --- FourmiCrawler/sources/NIST.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index f20d5ba..1d8c91f 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -185,8 +185,7 @@ class NIST(Source): return results - @staticmethod - def parse_generic_data(table, summary): + def parse_generic_data(self, table, summary): """Parses the common tables of 4 and 5 rows. 
Assumes they are of the form: Symbol (unit)|Temperature (K)|Method|Reference|Comment @@ -202,13 +201,11 @@ class NIST(Source): for tr in table.xpath('tr[td]'): tds = tr.xpath('td/text()').extract() - result = Result({ - 'attribute': summary, - 'value': tds[0] + ' ' + unit, - 'source': 'NIST', - 'reliability': 'Unknown', - 'conditions': '%s K' % tds[1] - }) + result = self.newresult( + attribute=summary, + value=tds[0] + ' ' + unit, + conditions='%s K' % tds[1] + ) results.append(result) return results From c49d76cb660bda1b6dc441164a48a70bc1a3a07f Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:03:51 +0200 Subject: [PATCH 14/56] updated parse_transition_data to use newresult function --- FourmiCrawler/sources/NIST.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 1d8c91f..19c18a5 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -161,8 +161,7 @@ class NIST(Source): results.append(result) return results - @staticmethod - def parse_transition_data(table, summary): + def parse_transition_data(self, table, summary): """Parses the table containing properties regarding phase changes""" results = [] @@ -174,13 +173,11 @@ class NIST(Source): for tr in table.xpath('tr[td]'): tds = tr.xpath('td/text()').extract() - result = Result({ - 'attribute': summary, - 'value': tds[0] + ' ' + unit, - 'source': 'NIST', - 'reliability': 'Unknown', - 'conditions': '%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) - }) + result = self.newresult( + attribute=summary, + value=tds[0] + ' ' + unit, + conditions='%s K, (%s -> %s)' % (tds[1], tds[2], tds[3]) + ) results.append(result) return results From 80770de5c0d42d31127b99e5aec3e4627dc46e5c Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:06:22 +0200 Subject: [PATCH 15/56] updated parse_aggregate_data to use newresult function --- FourmiCrawler/sources/NIST.py | 12 +++++------- 1 file 
changed, 5 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 19c18a5..c684e2c 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -150,13 +150,11 @@ class NIST(Source): name = m.group(1) condition = m.group(2) - result = Result({ - 'attribute': name, - 'value': data[1] + ' ' + data[2], - 'source': 'NIST', - 'reliability': 'Unknown', - 'conditions': condition - }) + result = Result( + attribute=name, + value=data[1] + ' ' + data[2], + conditions=condition + ) log.msg('NIST: |%s|' % data, level=log.DEBUG) results.append(result) return results From a77eafe5130914aacdea630cb2c64c5994b54187 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:08:22 +0200 Subject: [PATCH 16/56] updated parse_generic_info to use newresult function --- FourmiCrawler/sources/NIST.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index c684e2c..8f8338a 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -114,13 +114,10 @@ class NIST(Source): requests = [] for key, value in data.iteritems(): - result = Result({ - 'attribute': key, - 'value': value, - 'source': 'NIST', - 'reliability': 'Unknown', - 'conditions': '' - }) + result = self.newresult( + attribute=key, + value=value + ) requests.append(result) return requests From f6f5c5f6fe86eaaaac0a1bc6a32abecd52d0993b Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:13:25 +0200 Subject: [PATCH 17/56] added config to NIST along with reliability check --- FourmiCrawler/sources/NIST.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 8f8338a..afafae1 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -23,9 +23,14 @@ class NIST(Source): search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' ignore_list = set() + cfg = {} def 
__init__(self, config={}): Source.__init__(self, config) + self.cfg = config + if 'reliability' not in self.cfg or self.cfg['reliability'] == '': + log.msg('Reliability not set for NIST', level=log.WARNING) + self.cfg['reliability'] = '' def parse(self, response): sel = Selector(response) From 981615c6b3539bbad66d34c62b7bca19ed8a274f Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:14:24 +0200 Subject: [PATCH 18/56] chemspider __init__ now sets reliability to empty string if it does not exist in config --- FourmiCrawler/sources/ChemSpider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 38e7684..faad02d 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -34,6 +34,7 @@ class ChemSpider(Source): self.cfg = config if 'reliability' not in self.cfg: log.msg('Reliability not set for ChemSpider', level=log.WARNING) + self.cfg['reliability'] = '' if 'token' not in self.cfg or self.cfg['token'] == '': log.msg('ChemSpider token not set or empty, search/MassSpec API ' 'not available', level=log.WARNING) From 68139b483931cbddb44091c1c958003d6e9619a1 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:27:27 +0200 Subject: [PATCH 19/56] added config to wikipedia along with reliability check --- FourmiCrawler/sources/WikipediaParser.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index d0dfd03..dd27e34 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -19,8 +19,15 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] + cfg = {} + def __init__(self, config={}): Source.__init__(self, config) + self.cfg = config + if 'reliability' not in self.cfg or self.cfg['reliability'] == '': + log.msg('Reliability not set for Wikipedia', level=log.WARNING) + self.cfg['reliability'] = 
'' + def parse(self, response): """ Distributes the above described behaviour """ From 69664d3ac0fefa036ad5dfe99c2641a13a693ad8 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 19:30:31 +0200 Subject: [PATCH 20/56] added newrresult function to WikipediaParser.py --- FourmiCrawler/sources/WikipediaParser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index dd27e34..34b51c0 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -124,3 +124,12 @@ class WikipediaParser(Source): links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() return links + + def newresult(self, attribute, value): + return Result({ + 'attribute': attribute, + 'value': value, + 'source': 'Wikipedia', + 'reliability': self.cfg['reliability'], + 'conditions': '' + }) From 30f00b676d0b1396b538eb95e0127bea2393c12b Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 20:16:25 +0200 Subject: [PATCH 21/56] updated parse to use newresult function --- FourmiCrawler/sources/WikipediaParser.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 34b51c0..781d08f 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -28,7 +28,6 @@ class WikipediaParser(Source): log.msg('Reliability not set for Wikipedia', level=log.WARNING) self.cfg['reliability'] = '' - def parse(self, response): """ Distributes the above described behaviour """ log.msg('A response from %s just arrived!' 
% response.url, level=log.DEBUG) @@ -51,13 +50,10 @@ class WikipediaParser(Source): prop_names = tr_list[::2] prop_values = tr_list[1::2] for i, prop_name in enumerate(prop_names): - item = Result({ - 'attribute': prop_name.extract().encode('utf-8'), - 'value': prop_values[i].extract().encode('utf-8'), - 'source': "Wikipedia", - 'reliability': "Unknown", - 'conditions': "" - }) + item = self.newresult( + attribute=prop_name.extract().encode('utf-8'), + value=prop_values[i].extract().encode('utf-8') + ) items.append(item) log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) @@ -68,13 +64,10 @@ class WikipediaParser(Source): log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( 'normalize-space(string())'): - item = Result({ - 'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), - 'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), - 'source': "Wikipedia", - 'reliability': "Unknown", - 'conditions': "" - }) + item = self.newresult( + attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), + value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), + ) items.append(item) log.msg( 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), From 4eeabd7aba20ccfa1d73fc59a4a9ba38eb71c6df Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 23:16:22 +0200 Subject: [PATCH 22/56] removed erronous @staticmethod for parse_properties --- FourmiCrawler/sources/ChemSpider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index faad02d..7834077 100644 --- 
a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -53,8 +53,7 @@ class ChemSpider(Source): return requests - @staticmethod - def parse_properties(sel): + def parse_properties(self, sel): """scrape Experimental Data and Predicted ACD/Labs tabs""" properties = [] From a12add5e4c8d034d3590927ed85c3cc56319dea3 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 23:29:32 +0200 Subject: [PATCH 23/56] removed @staticmethod from parse_extendedinfo --- FourmiCrawler/sources/ChemSpider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 7834077..8078347 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -187,8 +187,7 @@ class ChemSpider(Source): } return synonym - @staticmethod - def parse_extendedinfo(response): + def parse_extendedinfo(self, response): """Scrape data from the ChemSpider GetExtendedCompoundInfo API""" sel = Selector(response) properties = [] From 012267c31c31cc72ce8377540b7c86533aebb6e7 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 23:32:03 +0200 Subject: [PATCH 24/56] fixed result in parse_aggregate_data --- FourmiCrawler/sources/NIST.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index afafae1..10496ab 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -152,7 +152,7 @@ class NIST(Source): name = m.group(1) condition = m.group(2) - result = Result( + result = self.newresult( attribute=name, value=data[1] + ' ' + data[2], conditions=condition From b847d2d5912288d4f2a860c55f1f6fc2ad81fa87 Mon Sep 17 00:00:00 2001 From: RTB Date: Fri, 6 Jun 2014 23:49:38 +0200 Subject: [PATCH 25/56] replaced ChemSpider() with NIST() due to token issues --- tests/test_spider.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_spider.py 
b/tests/test_spider.py index 89d6cfc..589a571 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -3,7 +3,7 @@ import unittest from scrapy.http import Request from FourmiCrawler import spider -from FourmiCrawler.sources.ChemSpider import ChemSpider +from FourmiCrawler.sources.NIST import NIST from FourmiCrawler.sources.source import Source @@ -41,7 +41,7 @@ class TestFoumiSpider(unittest.TestCase): self.spi.add_source(src) self.assertEqual(self.spi.start_requests(), []) - src2 = ChemSpider() + src2 = NIST() self.spi.add_source(src2) requests = self.spi.start_requests() self.assertGreater(len(requests), 0) @@ -57,8 +57,8 @@ class TestFoumiSpider(unittest.TestCase): self.assertEqual(self.spi.get_synonym_requests("new_compound"), []) self.assertIn("new_compound", self.spi.synonyms) - src2 = ChemSpider() + src2 = NIST() self.spi.add_source(src2) self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request) self.assertIn("other_compound", self.spi.synonyms) - self.assertEqual(self.spi.get_synonym_requests("other_compound"), []) \ No newline at end of file + self.assertEqual(self.spi.get_synonym_requests("other_compound"), []) From 13305f400b8cf6586541431bc6030e2de0fc7f79 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sat, 7 Jun 2014 11:57:59 +0200 Subject: [PATCH 26/56] Added coverage to the README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2b286a0..ef612f6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fourmi -**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) +**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) -**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) +**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) Fourmi is an web scraper for chemical substances. The program is designed to be used as a search engine to search multiple chemical databases for a specific From cbab2ac7a4f0ff6c4521fe87d0baa3db3eae94c9 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sat, 7 Jun 2014 12:21:00 +0200 Subject: [PATCH 27/56] The difference between class and object variables --- FourmiCrawler/sources/NIST.py | 2 +- FourmiCrawler/spider.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 10496ab..3e061ae 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -22,11 +22,11 @@ class NIST(Source): search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' - ignore_list = set() cfg = {} def __init__(self, config={}): Source.__init__(self, config) + self.ignore_list = set() self.cfg = config if 'reliability' not in self.cfg or self.cfg['reliability'] == '': log.msg('Reliability not set for NIST', level=log.WARNING) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 60f7363..5c09f07 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -9,8 +9,6 @@ class FourmiSpider(Spider): A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data. """ name = "FourmiSpider" - _sources = [] - synonyms = set() def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): """ @@ -18,6 +16,8 @@ class FourmiSpider(Spider): :param compound: compound that will be searched. :param selected_attributes: A list of regular expressions that the attributes should match. """ + self._sources = [] + self.synonyms = set() super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) self.selected_attributes = selected_attributes From 071018cbac4ca513ef5004e48acf4247d68efa20 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 8 Jun 2014 11:42:01 +0200 Subject: [PATCH 28/56] Made a different python package for all helpers/utils --- fourmi.py | 2 +- tests/test_sourceloader.py | 2 +- utils/__init__.py | 0 sourceloader.py => utils/sourceloader.py | 0 4 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 utils/__init__.py rename sourceloader.py => utils/sourceloader.py (100%) diff --git a/fourmi.py b/fourmi.py index 3596cf3..e11009e 100755 --- a/fourmi.py +++ b/fourmi.py @@ -30,7 +30,7 @@ from scrapy.utils.project import get_project_settings import docopt from FourmiCrawler.spider import FourmiSpider -from sourceloader import SourceLoader +from utils.sourceloader import SourceLoader def setup_crawler(compound, settings, source_loader, attributes): diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py index 1afca2d..9e62057 100644 --- a/tests/test_sourceloader.py +++ b/tests/test_sourceloader.py @@ -1,6 +1,6 @@ import unittest -from sourceloader import SourceLoader +from utils.sourceloader import SourceLoader class TestSourceloader(unittest.TestCase): diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourceloader.py b/utils/sourceloader.py similarity index 100% rename from sourceloader.py rename to utils/sourceloader.py From edc91c227941f2de3047a06c51f9927e907ff7ff Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 8 Jun 2014 12:04:33 +0200 Subject: [PATCH 29/56] Sourceloader should import dynamically --- utils/sourceloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/sourceloader.py b/utils/sourceloader.py index 2ed50a8..b6bb0fd 100644 --- a/utils/sourceloader.py +++ b/utils/sourceloader.py @@ -8,7 +8,7 @@ from FourmiCrawler.sources.source import Source class SourceLoader: sources = [] - def __init__(self, rel_dir="FourmiCrawler/sources"): + def __init__(self, rel_dir="../FourmiCrawler/sources"): """ The initiation of a SourceLoader, selects and indexes a directory for usable sources. :param rel_dir: A relative path to a directory. @@ -18,7 +18,7 @@ class SourceLoader: known_parser = set() for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) + mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if issubclass(cls, Source) and cls not in known_parser: From 7cafdac7a038fc9a1c5d86d17aeb6026ec220bac Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:08:02 +0200 Subject: [PATCH 30/56] Test all python files --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 34d3a88..f208964 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ install: # command to run tests, e.g. python setup.py test script: - - nosetests --with-coverage --cover-package=FourmiCrawler tests + - nosetests --with-coverage tests notifications: slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM From ee80c6eaa0766115d76322a90c3f92b9635897ab Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 8 Jun 2014 12:13:04 +0200 Subject: [PATCH 31/56] We don't want to check coverage for code we didn't write --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f208964..24c5dc5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ install: # command to run tests, e.g. python setup.py test script: - - nosetests --with-coverage tests + - nosetests --with-coverage --cover-package=FourmiCrawler,utils tests notifications: slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM From 007549aad815f0a5db9b8ce67c49b54cf1134419 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:40:07 +0200 Subject: [PATCH 32/56] Github Username Change --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ef612f6..8cb8d10 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fourmi -**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) +**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) -**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/Recondor/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) +**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) Fourmi is an web 
scraper for chemical substances. The program is designed to be used as a search engine to search multiple chemical databases for a specific From 90129f41ccf1db9363736ec3373a30d5a2c56d4f Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:42:21 +0200 Subject: [PATCH 33/56] Added the configuration of the scrapy settings as a new module --- fourmi.py | 46 +++++-------------------------------------- utils/configurator.py | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 41 deletions(-) create mode 100644 utils/configurator.py diff --git a/fourmi.py b/fourmi.py index e11009e..30b5a03 100755 --- a/fourmi.py +++ b/fourmi.py @@ -30,6 +30,7 @@ from scrapy.utils.project import get_project_settings import docopt from FourmiCrawler.spider import FourmiSpider +from utils.configurator import Configurator from utils.sourceloader import SourceLoader @@ -50,53 +51,16 @@ def setup_crawler(compound, settings, source_loader, attributes): crawler.start() -def scrapy_settings_manipulation(docopt_arguments): - """ - This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi - project these are command line arguments. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. - """ - settings = get_project_settings() - - if docopt_arguments["--output"] != 'result.*format*': - settings.overrides["FEED_URI"] = docopt_arguments["--output"] - elif docopt_arguments["--format"] == "jsonlines": - settings.overrides["FEED_URI"] = "results.json" - elif docopt_arguments["--format"] is not None: - settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"] - - if docopt_arguments["--format"] is not None: - settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"] - - return settings - - -def start_log(docopt_arguments): - """ - This function starts the logging functionality of Scrapy using the settings given by the CLI. 
- :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. - """ - if docopt_arguments["--log"] is not None: - if docopt_arguments["--verbose"]: - log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) - else: - log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING) - else: - if docopt_arguments["--verbose"]: - log.start(logstdout=False, loglevel=log.DEBUG) - else: - log.start(logstdout=True, loglevel=log.WARNING) - - def search(docopt_arguments, source_loader): """ The function that facilitates the search for a specific compound. :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. """ - start_log(docopt_arguments) - settings = scrapy_settings_manipulation(docopt_arguments) - setup_crawler(docopt_arguments[""], settings, source_loader, docopt_arguments["--attributes"].split(',')) + conf = Configurator() + conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) + conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) + setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) reactor.run() diff --git a/utils/configurator.py b/utils/configurator.py new file mode 100644 index 0000000..8b7ae8a --- /dev/null +++ b/utils/configurator.py @@ -0,0 +1,43 @@ +from scrapy import log +from scrapy.utils.project import get_project_settings + + +class Configurator: + + def __init__(self): + self.scrapy_settings = get_project_settings() + + + def set_output(self, filename, format): + """ + This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi + project these are command line arguments. + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 
+ """ + + if filename != 'result.*format*': + self.scrapy_settings.overrides["FEED_URI"] = format + elif format == "jsonlines": + self.scrapy_settings.overrides["FEED_URI"] = "results.json" + elif format is not None: + self.scrapy_settings.overrides["FEED_URI"] = "results." + format + + if format is not None: + self.scrapy_settings.overrides["FEED_FORMAT"] = format + + + def start_log(self, logfile, verbose): + """ + This function starts the logging functionality of Scrapy using the settings given by the CLI. + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + """ + if logfile is not None: + if verbose: + log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) + else: + log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) + else: + if verbose: + log.start(logstdout=False, loglevel=log.DEBUG) + else: + log.start(logstdout=True, loglevel=log.WARNING) From d765e7fce43c65ffcb6daac4c2c9005ff47c9ea5 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:46:50 +0200 Subject: [PATCH 34/56] Edited the documentation of the functions in the configurator --- utils/configurator.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/utils/configurator.py b/utils/configurator.py index 8b7ae8a..8e2e7e8 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -8,28 +8,30 @@ class Configurator: self.scrapy_settings = get_project_settings() - def set_output(self, filename, format): + def set_output(self, filename, fileformat): """ - This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi - project these are command line arguments. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + This function manipulates the Scrapy output file settings that normally would be set in the settings file. + In the Fourmi project these are command line arguments. 
+ :param filename: The filename of the file where the output will be put. + :param fileformat: The format in which the output will be. """ if filename != 'result.*format*': - self.scrapy_settings.overrides["FEED_URI"] = format - elif format == "jsonlines": + self.scrapy_settings.overrides["FEED_URI"] = fileformat + elif fileformat == "jsonlines": self.scrapy_settings.overrides["FEED_URI"] = "results.json" - elif format is not None: - self.scrapy_settings.overrides["FEED_URI"] = "results." + format + elif fileformat is not None: + self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat - if format is not None: - self.scrapy_settings.overrides["FEED_FORMAT"] = format + if fileformat is not None: + self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat def start_log(self, logfile, verbose): """ This function starts the logging functionality of Scrapy using the settings given by the CLI. - :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + :param logfile: The location where the logfile will be saved. + :param verbose: A boolean value to switch between loglevels. """ if logfile is not None: if verbose: From 51239dd34262d4279fe0d221e5e26aac3619c66e Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 12:47:57 +0200 Subject: [PATCH 35/56] Added a few lines on the configurator itself. --- utils/configurator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/configurator.py b/utils/configurator.py index 8e2e7e8..380b647 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -3,6 +3,10 @@ from scrapy.utils.project import get_project_settings class Configurator: + """ + A helper class in the fourmi class. This class is used to process the settings as set + from one of the Fourmi applications. + """ def __init__(self): self.scrapy_settings = get_project_settings() From c4ef75cf57183158412d03c0e45c06b1d6e0d8a0 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 8 Jun 2014 13:00:32 +0200 Subject: [PATCH 36/56] Uniform naming of result file --- fourmi.py | 2 +- utils/configurator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fourmi.py b/fourmi.py index 30b5a03..a959091 100755 --- a/fourmi.py +++ b/fourmi.py @@ -17,7 +17,7 @@ Options: --version Show version. --verbose Verbose logging output. --log= Save log to an file. - -o --output= Output file [default: result.*format*] + -o --output= Output file [default: results.*format*] -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. diff --git a/utils/configurator.py b/utils/configurator.py index 380b647..2b458b6 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -20,7 +20,7 @@ class Configurator: :param fileformat: The format in which the output will be. """ - if filename != 'result.*format*': + if filename != 'results.*format*': self.scrapy_settings.overrides["FEED_URI"] = fileformat elif fileformat == "jsonlines": self.scrapy_settings.overrides["FEED_URI"] = "results.json" From 683de68fb7cb7943125437b21f6b1350325e93ea Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 8 Jun 2014 13:08:37 +0200 Subject: [PATCH 37/56] Added tests and fixed the output settings --- utils/configurator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/configurator.py b/utils/configurator.py index 2b458b6..90e0320 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -21,7 +21,7 @@ class Configurator: """ if filename != 'results.*format*': - self.scrapy_settings.overrides["FEED_URI"] = fileformat + self.scrapy_settings.overrides["FEED_URI"] = filename elif fileformat == "jsonlines": self.scrapy_settings.overrides["FEED_URI"] = "results.json" elif fileformat is not None: From 351a7d08eae1385243cd3784ddc0cd5d8da41d18 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:10:18 +0200 Subject: [PATCH 38/56] Added tests for the configurator --- tests/test_configurator.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 tests/test_configurator.py diff --git a/tests/test_configurator.py b/tests/test_configurator.py new file mode 100644 index 0000000..da79096 --- /dev/null +++ b/tests/test_configurator.py @@ -0,0 +1,27 @@ +import unittest +from utils.configurator import Configurator + + +class TestConfigurator(unittest.TestCase): + + def setUp(self): + self.conf = Configurator() + + def test_set_output(self): + self.conf.set_output(filename="test.txt", fileformat="csv") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") + + self.conf.set_output("results.*format*", "jsonlines") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines") + + self.conf.set_output("results.*format*", "csv") + self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") + self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") + + def test_start_log(self): + 
self.conf.start_log("test.log", True) + self.conf.start_log("test.log", False) + self.conf.start_log(None, True) + self.conf.start_log(None, False) \ No newline at end of file From a1dd39f92a9fd6860e148360f8ad3e9ca567974a Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 8 Jun 2014 13:12:35 +0200 Subject: [PATCH 39/56] Made CSV our default format, as it's probably the most likely to be used. --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index a959091..68d221a 100755 --- a/fourmi.py +++ b/fourmi.py @@ -18,7 +18,7 @@ Options: --verbose Verbose logging output. --log= Save log to an file. -o --output= Output file [default: results.*format*] - -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] + -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: csv] --include= Include only sources that match these regular expressions split by a comma. --exclude= Exclude the sources that match these regular expressions split by a comma. """ From efa8d45d9c08120374b8cb34a39574565881f3d8 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 8 Jun 2014 13:28:14 +0200 Subject: [PATCH 40/56] I don't yet know a way to test the start_log function --- tests/test_configurator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index da79096..8cc61ea 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -20,8 +20,8 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") - def test_start_log(self): - self.conf.start_log("test.log", True) - self.conf.start_log("test.log", False) - self.conf.start_log(None, True) - self.conf.start_log(None, False) \ No newline at end of file + # def test_start_log(self): + # self.conf.start_log("test.log", True) + # self.conf.start_log("test.log", False) + # self.conf.start_log(None, True) + # self.conf.start_log(None, False) \ No newline at end of file From 98c3fbc590920e1afbeaf036841791e197c7d5e5 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 8 Jun 2014 13:33:57 +0200 Subject: [PATCH 41/56] Link of to the right page (Github name change) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8cb8d10..48b0419 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Fourmi -**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=master) +**Master branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=master)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=master) -**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/Recondor/Fourmi?branch=develop) +**Developing branch**: [![Build Status](https://travis-ci.org/jjdekker/Fourmi.svg?branch=develop)](https://travis-ci.org/jjdekker/Fourmi) [![Coverage Status](https://img.shields.io/coveralls/jjdekker/Fourmi.svg)](https://coveralls.io/r/jjdekker/Fourmi?branch=develop) Fourmi is an web scraper for chemical substances. 
The program is designed to be used as a search engine to search multiple chemical databases for a specific From 96a7f5acd43e82c7da98da15219349282d9c2a6d Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 18:52:29 +0200 Subject: [PATCH 42/56] added get_section function to grab sections from sources.cfg --- utils/configurator.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/utils/configurator.py b/utils/configurator.py index 90e0320..db6c111 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,6 +1,6 @@ from scrapy import log from scrapy.utils.project import get_project_settings - +import ConfigParser class Configurator: """ @@ -47,3 +47,13 @@ class Configurator: log.start(logstdout=False, loglevel=log.DEBUG) else: log.start(logstdout=True, loglevel=log.WARNING) + + def get_section(self, config, sourcename): + section = dict() + if config.has_section(sourcename): + section = dict(config.items(sourcename)) + if 'reliability' not in section: + log.msg('Reliability not set for %s' % sourcename, + level=log.WARNING) + section['reliability'] = '' + return section From 87ec6e6506d2da848a7a34d52a582e2d86fa6dfe Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:03:01 +0200 Subject: [PATCH 43/56] added read_sourceconfiguration function --- utils/configurator.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/configurator.py b/utils/configurator.py index db6c111..658cf4b 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -48,6 +48,11 @@ class Configurator: else: log.start(logstdout=True, loglevel=log.WARNING) + def read_sourceconfiguration(self): + config = ConfigParser.ConfigParser() + config.read('sources.cfg') # [TODO]: should be softcoded eventually + return config + def get_section(self, config, sourcename): section = dict() if config.has_section(sourcename): From f93ff4a309eebfd5129e652e806e3979e65c0d60 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:14:33 +0200 
Subject: [PATCH 44/56] made read_sourceconfiguration() and get_section() static --- utils/configurator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/utils/configurator.py b/utils/configurator.py index 658cf4b..a87b28a 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -48,12 +48,14 @@ class Configurator: else: log.start(logstdout=True, loglevel=log.WARNING) - def read_sourceconfiguration(self): + @staticmethod + def read_sourceconfiguration(): config = ConfigParser.ConfigParser() config.read('sources.cfg') # [TODO]: should be softcoded eventually return config - def get_section(self, config, sourcename): + @staticmethod + def get_section(config, sourcename): section = dict() if config.has_section(sourcename): section = dict(config.items(sourcename)) From 3278de2b3a0588814b955f5ca8fa5e92fe507f21 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:15:07 +0200 Subject: [PATCH 45/56] made sourceloader use static Configurator methods --- utils/sourceloader.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/utils/sourceloader.py b/utils/sourceloader.py index 07f966f..9b33657 100644 --- a/utils/sourceloader.py +++ b/utils/sourceloader.py @@ -1,10 +1,9 @@ import inspect import os import re -import ConfigParser from FourmiCrawler.sources.source import Source - +from utils.configurator import Configurator class SourceLoader: sources = [] @@ -20,17 +19,14 @@ class SourceLoader: path += "/" + rel_dir known_parser = set() - config = ConfigParser.ConfigParser() - config.read('sources.cfg') + config = Configurator.read_sourceconfiguration() for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: mod = __import__('.'.join([rel_dir.replace("../", "").replace("/", "."), py]), fromlist=[py]) classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] for cls in classes: if issubclass(cls, Source) and cls not in known_parser: - sourcecfg = dict() - 
if config.has_section(cls.__name__): - sourcecfg = dict(config.items(cls.__name__)) + sourcecfg = Configurator.get_section(config, cls.__name__) self.sources.append(cls(sourcecfg)) known_parser.add(cls) From 09ab4249baf89dbdc51676d298b6af31a7db4fd0 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:21:56 +0200 Subject: [PATCH 46/56] added function descriptions of read_sourceonfiguration and get_section --- utils/configurator.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/utils/configurator.py b/utils/configurator.py index a87b28a..83dee1b 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -50,12 +50,24 @@ class Configurator: @staticmethod def read_sourceconfiguration(): + """ + This function reads sources.cfg in the main folder for configuration + variables for sources + :return a ConfigParser object of sources.cfg + """ config = ConfigParser.ConfigParser() config.read('sources.cfg') # [TODO]: should be softcoded eventually return config @staticmethod def get_section(config, sourcename): + """ + This function reads a config section labeled in variable sourcename and + tests whether the reliability variable is set else set to empty string + :param config: a ConfigParser object + :param sourcename: the name of the section to be read + :return a dictionary of the section in the config labeled in sourcename + """ section = dict() if config.has_section(sourcename): section = dict(config.items(sourcename)) From a6fb27f8a702afc5bea9603fcd50520a0802ead7 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:29:29 +0200 Subject: [PATCH 47/56] the difference between class and object variables... 
--- FourmiCrawler/sources/ChemSpider.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 8078347..a9894bb 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -26,12 +26,10 @@ class ChemSpider(Source): structure = 'Chemical-Structure.%s.html' extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' - cfg = {} - ignore_list = [] - def __init__(self, config={}): Source.__init__(self, config) self.cfg = config + self.ignore_list = [] if 'reliability' not in self.cfg: log.msg('Reliability not set for ChemSpider', level=log.WARNING) self.cfg['reliability'] = '' From f01ff62d99e045c0afa0d4cb9b0471593500874c Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:30:05 +0200 Subject: [PATCH 48/56] removed test for existence of reliability in config from chemspider init --- FourmiCrawler/sources/ChemSpider.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index a9894bb..87a6ee7 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -30,9 +30,6 @@ class ChemSpider(Source): Source.__init__(self, config) self.cfg = config self.ignore_list = [] - if 'reliability' not in self.cfg: - log.msg('Reliability not set for ChemSpider', level=log.WARNING) - self.cfg['reliability'] = '' if 'token' not in self.cfg or self.cfg['token'] == '': log.msg('ChemSpider token not set or empty, search/MassSpec API ' 'not available', level=log.WARNING) From bbc9abadb890694909e914bc84c10bd91eaa079a Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:30:59 +0200 Subject: [PATCH 49/56] removed test for existence of reliability in config from NIST init --- FourmiCrawler/sources/NIST.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 3e061ae..3c323ef 
100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -28,9 +28,6 @@ class NIST(Source): Source.__init__(self, config) self.ignore_list = set() self.cfg = config - if 'reliability' not in self.cfg or self.cfg['reliability'] == '': - log.msg('Reliability not set for NIST', level=log.WARNING) - self.cfg['reliability'] = '' def parse(self, response): sel = Selector(response) From 806b816c302f0cf7efd119ce3bdb485071f0d50e Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 19:31:34 +0200 Subject: [PATCH 50/56] removed test for existence of reliability in config from WikipediaParser init --- FourmiCrawler/sources/WikipediaParser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 781d08f..8722cef 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -24,9 +24,6 @@ class WikipediaParser(Source): def __init__(self, config={}): Source.__init__(self, config) self.cfg = config - if 'reliability' not in self.cfg or self.cfg['reliability'] == '': - log.msg('Reliability not set for Wikipedia', level=log.WARNING) - self.cfg['reliability'] = '' def parse(self, response): """ Distributes the above described behaviour """ From a62b40a21fa37a888a8e4ccfba37b73174fe80e2 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 20:16:30 +0200 Subject: [PATCH 51/56] get_section in configurator returns the default section if the requested section does not exist --- utils/configurator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils/configurator.py b/utils/configurator.py index 83dee1b..dfc6330 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -63,7 +63,8 @@ class Configurator: def get_section(config, sourcename): """ This function reads a config section labeled in variable sourcename and - tests whether the reliability variable is set else set to empty string + tests 
whether the reliability variable is set else set to empty string. + Return the default section if the labeled config section does not exist :param config: a ConfigParser object :param sourcename: the name of the section to be read :return a dictionary of the section in the config labeled in sourcename @@ -71,6 +72,8 @@ class Configurator: section = dict() if config.has_section(sourcename): section = dict(config.items(sourcename)) + elif config.defaults(): + section = config.defaults() if 'reliability' not in section: log.msg('Reliability not set for %s' % sourcename, level=log.WARNING) From 8cb6bb8d417160b36d5357cd6467ba712353b9e7 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 21:13:35 +0200 Subject: [PATCH 52/56] added simple tests for read_sourceconfiguration and get_section --- tests/test_configurator.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index 8cc61ea..93e28df 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -1,6 +1,7 @@ import unittest from utils.configurator import Configurator +import ConfigReader class TestConfigurator(unittest.TestCase): @@ -24,4 +25,26 @@ class TestConfigurator(unittest.TestCase): # self.conf.start_log("test.log", True) # self.conf.start_log("test.log", False) # self.conf.start_log(None, True) - # self.conf.start_log(None, False) \ No newline at end of file + # self.conf.start_log(None, False) + + def test_read_sourceconfiguration(self): + config = self.conf.read_sourceconfiguration() + self.assertIsInstance(config, ConfigReader) + + def test_get_section(self): + config = ConfigReader.ConfigReader() + section = self.conf.get_section(config, 'test') + self.assertIn(section, 'reliability') + self.assertEquals(section['reliability'], '') + + config.set('DEFAULT', 'reliability', 'Low') + + section = self.conf.get_section(config, 'test') + self.assertEquals(section['reliability'] = 'Low') + + 
config.add_section('test') + config.set('test', 'var', 'Maybe') + + section = self.conf.get_section(config, 'test') + self.assertEquals(section['reliability'] = 'Low') + self.assertEqual(section['var'], 'Maybe') From d141bb9f4fcd8a268b5b9cac1048a8718d041795 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 21:19:58 +0200 Subject: [PATCH 53/56] replaced erroneous equality signs with commas --- tests/test_configurator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index 93e28df..236597b 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -40,11 +40,11 @@ class TestConfigurator(unittest.TestCase): config.set('DEFAULT', 'reliability', 'Low') section = self.conf.get_section(config, 'test') - self.assertEquals(section['reliability'] = 'Low') + self.assertEquals(section['reliability'], 'Low') config.add_section('test') config.set('test', 'var', 'Maybe') section = self.conf.get_section(config, 'test') - self.assertEquals(section['reliability'] = 'Low') + self.assertEquals(section['reliability'], 'Low') self.assertEqual(section['var'], 'Maybe') From a43d90ae69668dd580cf298393133d51ce17db87 Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 21:26:17 +0200 Subject: [PATCH 54/56] replaced erroneous ConfigReader with ConfigParser...
--- tests/test_configurator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index 236597b..533db5d 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -1,7 +1,7 @@ import unittest from utils.configurator import Configurator -import ConfigReader +import ConfigParser class TestConfigurator(unittest.TestCase): @@ -29,10 +29,10 @@ class TestConfigurator(unittest.TestCase): def test_read_sourceconfiguration(self): config = self.conf.read_sourceconfiguration() - self.assertIsInstance(config, ConfigReader) + self.assertIsInstance(config, ConfigParser) def test_get_section(self): - config = ConfigReader.ConfigReader() + config = ConfigParser.ConfigParser() section = self.conf.get_section(config, 'test') self.assertIn(section, 'reliability') self.assertEquals(section['reliability'], '') From 326413effa07cffa2388c7ecd414e699f7cef4cf Mon Sep 17 00:00:00 2001 From: RTB Date: Sun, 8 Jun 2014 21:42:38 +0200 Subject: [PATCH 55/56] fixed syntax of erroneous assertIn and assertIsInstance commands --- tests/test_configurator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index 533db5d..eb43cb7 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -29,12 +29,12 @@ class TestConfigurator(unittest.TestCase): def test_read_sourceconfiguration(self): config = self.conf.read_sourceconfiguration() - self.assertIsInstance(config, ConfigParser) + self.assertIsInstance(config, ConfigParser.ConfigParser) def test_get_section(self): config = ConfigParser.ConfigParser() section = self.conf.get_section(config, 'test') - self.assertIn(section, 'reliability') + self.assertIn('reliability', section) self.assertEquals(section['reliability'], '') config.set('DEFAULT', 'reliability', 'Low') From e7da0bb1772e627b8f914db6a64431e6f1467427 Mon Sep 17 00:00:00 2001 From: "Jip J.
Dekker" Date: Sun, 8 Jun 2014 22:39:27 +0200 Subject: [PATCH 56/56] Bumped version number --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 68d221a..e6d7e9a 100755 --- a/fourmi.py +++ b/fourmi.py @@ -66,7 +66,7 @@ def search(docopt_arguments, source_loader): # The start for the Fourmi Command Line interface. if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') loader = SourceLoader() if arguments["--include"]: