From ca90796904b8d3ff0b0cd0c00f87b1761ddca7ad Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 19:53:37 +0200
Subject: [PATCH 01/37] Added documentation to the Executable Python file

---
 fourmi.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index efa4e54..c09087d 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -34,6 +34,13 @@ from sourceloader import SourceLoader
 
 
 def setup_crawler(searchable, settings, source_loader, attributes):
+    """
+    This function prepares and start the crawler which starts the actual search on the internet
+    :param searchable: The compound which should be searched
+    :param settings: A scrapy settings object
+    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
+    :param attributes: A list of regular expressions which the attribute names should match.
+    """
     spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
     spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
@@ -44,8 +51,13 @@ def setup_crawler(searchable, settings, source_loader, attributes):
 
 
 def scrapy_settings_manipulation(docopt_arguments):
+    """
+    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
+    project these are command line arguments.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
     settings = get_project_settings()
-    # [todo] - add at least a warning for files that already exist
+
     if docopt_arguments["--output"] != 'result.*format*':
         settings.overrides["FEED_URI"] = docopt_arguments["--output"]
     elif docopt_arguments["--format"] == "jsonlines":
@@ -60,6 +72,10 @@
 
 
 def start_log(docopt_arguments):
+    """
+    This function starts the logging functionality of Scrapy using the settings given by the CLI.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
     if docopt_arguments["--log"] is not None:
         if docopt_arguments["--verbose"]:
             log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
@@ -73,12 +89,18 @@
 
 
 def search(docopt_arguments, source_loader):
+    """
+    The function that facilitates the search for a specific compound.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
+    """
     start_log(docopt_arguments)
     settings = scrapy_settings_manipulation(docopt_arguments)
     setup_crawler(docopt_arguments[""], settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()
 
 
+# The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
     arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
     loader = SourceLoader()

From e272c9f3425d42446abd1f428448edc944f22319 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 19:55:10 +0200
Subject: [PATCH 02/37] Changed a parameter name for clarification

---
 fourmi.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fourmi.py b/fourmi.py
index c09087d..9f32cff 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -33,15 +33,15 @@ from FourmiCrawler.spider import FourmiSpider
 from sourceloader import SourceLoader
 
 
-def setup_crawler(searchable, settings, source_loader, attributes):
+def setup_crawler(compound, settings, source_loader, attributes):
     """
     This function prepares and start the crawler which starts the actual search on the internet
-    :param searchable: The compound which should be searched
+    :param compound: The compound which should be searched
     :param settings: A scrapy settings object
     :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
     :param attributes: A list of regular expressions which the attribute names should match.
     """
-    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
+    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
     spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

From a040bc7a0263aed473ab1b5ce2f294aeaad81d2b Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:01:19 +0200
Subject: [PATCH 03/37] Added documentation for the sourceloader

---
 sourceloader.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/sourceloader.py b/sourceloader.py
index 9957a70..2ed50a8 100644
--- a/sourceloader.py
+++ b/sourceloader.py
@@ -1,6 +1,7 @@
 import inspect
 import os
 import re
+
 from FourmiCrawler.sources.source import Source
 
 
@@ -8,6 +9,10 @@ class SourceLoader:
     sources = []
 
     def __init__(self, rel_dir="FourmiCrawler/sources"):
+        """
+        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        :param rel_dir: A relative path to a directory.
+        """
         path = os.path.dirname(os.path.abspath(__file__))
         path += "/" + rel_dir
         known_parser = set()
@@ -21,18 +26,30 @@ class SourceLoader:
                 known_parser.add(cls)
 
     def include(self, source_names):
+        """
+        This function excludes all sources that don't match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         new = set()
         for name in source_names:
             new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = list(new)
 
     def exclude(self, source_names):
+        """
+        This function excludes all sources that match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         exclude = []
         for name in source_names:
             exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = [src for src in self.sources if src not in exclude]
 
     def __str__(self):
+        """
+        This function returns a string with all sources currently available in the SourceLoader.
+        :return: a string with all available sources.
+        """
         string = ""
         for src in self.sources:
             string += "Source: " + src.__class__.__name__

From c4876f029baa41dd17197f0fb72fc5c466f71d1d Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:14:47 +0200
Subject: [PATCH 04/37] Added documentation to the FourmiSpider

---
 FourmiCrawler/spider.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 87f22c6..8ec18cc 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -1,19 +1,34 @@
+import re
+
 from scrapy.spider import Spider
 from scrapy import log
-import re
 
 
 class FourmiSpider(Spider):
+    """
+    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
+    """
     name = "FourmiSpider"
     __parsers = []
     synonyms = []
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.append(compound)
         self.selected_attributes = selected_attributes;
 
     def parse(self, reponse):
+        """
+        The function that is called when a response to a request is available. This function distributes this to a
+        parser which should be able to handle parsing the data.
+        :param reponse: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Request to be handled by the scrapy core.
+        """
         for parser in self.__parsers:
             if re.match(parser.website, reponse.url):
                 log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
@@ -21,6 +36,11 @@ class FourmiSpider(Spider):
         return None
 
     def get_synonym_requests(self, compound):
+        """
+        A function that generates new Scrapy Request for each source given a new synonym of a compound.
+        :param compound: A compound name
+        :return: A list of Scrapy Request objects
+        """
         requests = []
         for parser in self.__parsers:
             parser_requests = parser.new_compound_request(compound)
@@ -29,15 +49,27 @@ class FourmiSpider(Spider):
         return requests
 
     def start_requests(self):
+        """
+        The function called by Scrapy for it's first Requests
+        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
+        """
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
        return requests
 
     def add_parsers(self, parsers):
+        """
+        A function to add a new Parser objects to the list of available parsers.
+        :param parsers: A list of Parser Objects.
+        """
         for parser in parsers:
             self.add_parser(parser)
 
     def add_parser(self, parser):
+        """
+        A function add a new Parser object to the list of available parsers.
+        :param parser: A Parser Object
+        """
         self.__parsers.append(parser)
         parser.set_spider(self)
\ No newline at end of file

From 3499946e97be70b98de89566a30999ba0d1666b8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:15:15 +0200
Subject: [PATCH 05/37] Fixed a typo

---
 FourmiCrawler/spider.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 8ec18cc..a58b6ea 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -22,17 +22,17 @@ class FourmiSpider(Spider):
         self.synonyms.append(compound)
         self.selected_attributes = selected_attributes;
 
-    def parse(self, reponse):
+    def parse(self, response):
         """
         The function that is called when a response to a request is available. This function distributes this to a
         parser which should be able to handle parsing the data.
-        :param reponse: A Scrapy Response object that should be parsed
+        :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
         for parser in self.__parsers:
-            if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(reponse)
+            if re.match(parser.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + parser.website, level=log.DEBUG)
+                return parser.parse(response)
         return None
 
     def get_synonym_requests(self, compound):

From c27a875d681d0f912570bef4a583b85ea483bdbe Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:18:03 +0200
Subject: [PATCH 06/37] Parser/Source consistency

---
 FourmiCrawler/spider.py | 32 ++++++++++++++++----------------
 fourmi.py               |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index a58b6ea..08abb6b 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,7 +9,7 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    __parsers = []
+    __sources = []
     synonyms = []
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
@@ -25,14 +25,14 @@ class FourmiSpider(Spider):
     def parse(self, response):
         """
         The function that is called when a response to a request is available. This function distributes this to a
-        parser which should be able to handle parsing the data.
+        source which should be able to handle parsing the data.
         :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
-        for parser in self.__parsers:
-            if re.match(parser.website, response.url):
-                log.msg("Url: " + response.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(response)
+        for source in self.__sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
         return None
 
     def get_synonym_requests(self, compound):
@@ -42,7 +42,7 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self.__parsers:
+        for parser in self.__sources:
             parser_requests = parser.new_compound_request(compound)
             if parser_requests is not None:
                 requests.append(parser_requests)
@@ -58,18 +58,18 @@ class FourmiSpider(Spider):
         requests.extend(self.get_synonym_requests(synonym))
         return requests
 
-    def add_parsers(self, parsers):
+    def add_sources(self, sources):
         """
-        A function to add a new Parser objects to the list of available parsers.
-        :param parsers: A list of Parser Objects.
+        A function to add a new Parser objects to the list of available sources.
+        :param sources: A list of Source Objects.
         """
-        for parser in parsers:
-            self.add_parser(parser)
+        for parser in sources:
+            self.add_source(parser)
 
-    def add_parser(self, parser):
+    def add_source(self, source):
         """
         A function add a new Parser object to the list of available parsers.
-        :param parser: A Parser Object
+        :param source: A Source Object
         """
-        self.__parsers.append(parser)
-        parser.set_spider(self)
\ No newline at end of file
+        self.__sources.append(source)
+        source.set_spider(self)
\ No newline at end of file
diff --git a/fourmi.py b/fourmi.py
index 9f32cff..945c8a2 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -42,7 +42,7 @@ def setup_crawler(compound, settings, source_loader, attributes):
     :param attributes: A list of regular expressions which the attribute names should match.
     """
     spider = FourmiSpider(compound=compound, selected_attributes=attributes)
-    spider.add_parsers(source_loader.sources)
+    spider.add_sources(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()

From f7d0fb4a450c10ab6ce147406f216e537f474c32 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:24:54 +0200
Subject: [PATCH 07/37] Added documentation to the basic Source

---
 FourmiCrawler/sources/source.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index 3c51724..1ac0b9e 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -7,15 +7,32 @@ class Source:
     _spider = None
 
     def __init__(self):
+        """
+        Initiation of a new Source
+        """
         pass
 
     def parse(self, reponse):
+        """
+        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
+        :param reponse: A Scrapy Response object
+        :return: A list of Result items and new Scrapy Requests
+        """
         log.msg("The parse function of the empty parser was used.", level=log.WARNING)
         pass
 
     def new_compound_request(self, compound):
+        """
+        This function should return a Scrapy Request for the given compound request.
+        :param compound: A compound name.
+        :return: A new Scrapy Request
+        """
         # return Request(url=self.website[:-1] + compound, callback=self.parse)
         pass
 
     def set_spider(self, spider):
+        """
+        A Function to save the associated spider.
+        :param spider: A FourmiSpider object
+        """
         self._spider = spider

From f81b1c950074a8ab181b3f91034f58db9c2b8c54 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:25:46 +0200
Subject: [PATCH 08/37] Fixed a typo

---
 FourmiCrawler/sources/source.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py
index 1ac0b9e..d289d72 100644
--- a/FourmiCrawler/sources/source.py
+++ b/FourmiCrawler/sources/source.py
@@ -12,13 +12,13 @@ class Source:
         """
         pass
 
-    def parse(self, reponse):
+    def parse(self, response):
         """
         This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
-        :param reponse: A Scrapy Response object
+        :param response: A Scrapy Response object
         :return: A list of Result items and new Scrapy Requests
         """
-        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        log.msg("The parse function of the empty source was used.", level=log.WARNING)
         pass
 
     def new_compound_request(self, compound):

From aac0a7c79c661db1c452bc5d31c9b2c77589701c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 1 Jun 2014 20:29:51 +0200
Subject: [PATCH 09/37] References to the main Scrapy documentation

---
 FourmiCrawler/items.py     | 4 +---
 FourmiCrawler/pipelines.py | 7 +++----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/items.py b/FourmiCrawler/items.py
index c7fd41c..9f9a516 100644
--- a/FourmiCrawler/items.py
+++ b/FourmiCrawler/items.py
@@ -1,6 +1,4 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html
 
 from scrapy.item import Item, Field
diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index e1dadbf..ff7ceed 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -1,8 +1,7 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 import re
+
 from scrapy.exceptions import DropItem

From d4a0ffdff3216aa0af7273e639e24fb2034adced Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 12:01:05 +0200
Subject: [PATCH 10/37] Optimized imports

---
 FourmiCrawler/sources/ChemSpider.py      | 7 +++++--
 FourmiCrawler/sources/NIST.py            | 7 +++++--
 FourmiCrawler/sources/WikipediaParser.py | 6 ++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 2fcd07c..254c1a5 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -1,9 +1,12 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
+
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 0b75b17..2fe5966 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -1,9 +1,12 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
+
 
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
 # [TODO]: properties have references and comments which do not exist in the
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index cb7d0b9..c4f7a0f 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class WikipediaParser(Source):

From 7a8c0fe6adefd23cda38218ec63b58454d0a2344 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 12:18:15 +0200
Subject: [PATCH 11/37] Added a basic testing structure

---
 .travis.yml                | 15 +++++++++++++++
 tests/__init__.py          |  1 +
 tests/test_sourceloader.py |  5 +++++
 3 files changed, 21 insertions(+)
 create mode 100644 .travis.yml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_sourceloader.py

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..7a243ba
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,15 @@
+# Config file for automatic testing at travis-ci.org
+
+language: python
+python: 2.7
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+  - pip install flake8 Scrapy docopt
+
+# command to run tests, e.g. python setup.py test
+script:
+  - nosetests tests
+  - make lint
+
+after_success: coveralls
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..34a27d6
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+__author__ = 'jdekker'
diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
new file mode 100644
index 0000000..c7ccff9
--- /dev/null
+++ b/tests/test_sourceloader.py
@@ -0,0 +1,5 @@
+import unittest
+
+
+class TestSourceloader(unittest.TestCase):
+    pass
\ No newline at end of file

From c3d2bf92e5c98e3dc5e1990bc1fae074dcca5fe9 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 12:43:33 +0200
Subject: [PATCH 12/37] Added tests for the source loader

---
 tests/test_sourceloader.py | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
index c7ccff9..d97578b 100644
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -1,5 +1,34 @@
 import unittest
 
+from sourceloader import SourceLoader
+
 
 class TestSourceloader(unittest.TestCase):
-    pass
\ No newline at end of file
+
+    def setUp(self):
+        self.loader = SourceLoader()
+
+    def test_init(self):
+        # Test if sourceloader points to the right directory, where the sources are present.
+        self.assertIn("Source: Source", str(self.loader))
+        self.assertIn("Source: NIST", str(self.loader))
+        self.assertIn("Source: ChemSpider", str(self.loader))
+        self.assertIn("Source: WikipediaParser", str(self.loader))
+
+    def test_include(self):
+        #Tests for the include functionality.
+        self.loader.include(["So.rc.*"])
+
+        self.assertIn("Source: Source", str(self.loader))
+        self.assertNotIn("Source: NIST", str(self.loader))
+        self.assertNotIn("Source: ChemSpider", str(self.loader))
+        self.assertNotIn("Source: WikipediaParser", str(self.loader))
+
+    def test_exclude(self):
+        #Tests for the exclude functionality.
+        self.loader.exclude(["So.rc.*"])
+
+        self.assertNotIn("Source: Source", str(self.loader))
+        self.assertIn("Source: NIST", str(self.loader))
+        self.assertIn("Source: ChemSpider", str(self.loader))
+        self.assertIn("Source: WikipediaParser", str(self.loader))

From 704c5c25deebc473b117d65d5180c251af9c1121 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 12:51:15 +0200
Subject: [PATCH 13/37] Travis CI (hopefully working settings)

---
 .travis.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 7a243ba..2c4e998 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,11 +5,8 @@ python: 2.7
 
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
-  - pip install flake8 Scrapy docopt
+  - pip install Scrapy docopt
 
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests tests
-  - make lint
-
-after_success: coveralls
\ No newline at end of file
+  - nosetests tests
\ No newline at end of file

From 55130ea38884c18c8df12003cf0b9dcf3e80c3ad Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 12:59:22 +0200
Subject: [PATCH 14/37] Added Travis badges to the README

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index e9150a6..7769216 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,8 @@
 # Fourmi
 
+**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
+
 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
 substance. The program will produce all available attributes of the substance

From b9252cc3fd626fe198280c142cc06972e9f16c38 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 15:18:27 +0200
Subject: [PATCH 15/37] Removed name from __init__ file

---
 tests/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/__init__.py b/tests/__init__.py
index 34a27d6..8b13789 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1 +1 @@
-__author__ = 'jdekker'
+

From b6afb3b2b56b8886e99f8c058ee634a6fa6d9503 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 15:22:18 +0200
Subject: [PATCH 16/37] Made Fourmi a python package

---
 __init__.py                | 1 +
 tests/test_sourceloader.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 __init__.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
index d97578b..cf5ed0f 100644
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -2,7 +2,6 @@ import unittest
 
 from sourceloader import SourceLoader
 
-
 class TestSourceloader(unittest.TestCase):
 
     def setUp(self):

From 26702666b61c3b8734f9d96d9fd851d0252e77a8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 16:06:41 +0200
Subject: [PATCH 17/37] Added travis notification to the slack channel

---
 .travis.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 2c4e998..63c9412 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,4 +9,7 @@ install:
 
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests tests
\ No newline at end of file
+  - nosetests tests
+
+notifications:
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
\ No newline at end of file

From c48c4ec697520e9f73fe367a8d7b810cfdf6e277 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 16:09:55 +0200
Subject: [PATCH 18/37] None pipeline doesn't need a set

---
 FourmiCrawler/pipelines.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 0d4a405..2dfd531 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -7,7 +7,7 @@ from scrapy.exceptions import DropItem
 class RemoveNonePipeline(object):
 
     def __init__(self):
-        self.known_values = set()
+        pass
 
     def process_item(self, item, spider):
         """

From 75c0be1fea2e35be255c01499dec6c3906c5c868 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 16:50:14 +0200
Subject: [PATCH 19/37] Added tests for the pipeline

---
 FourmiCrawler/pipelines.py |  2 +-
 tests/test_pipeline.py     | 48 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_pipeline.py

diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 2dfd531..1bcba3a 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -35,7 +35,7 @@ class DuplicatePipeline(object):
         """
         value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  #[todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
new file mode 100644
index 0000000..9500272
--- /dev/null
+++ b/tests/test_pipeline.py
@@ -0,0 +1,48 @@
+import copy
+import unittest
+from FourmiCrawler import pipelines, spider, items
+from scrapy.exceptions import DropItem
+
+
+class TestPipelines(unittest.TestCase):
+
+    def setUp(self):
+        self.testItem = items.Result()
+
+    def test_NonePipeline(self):
+        self.testItem["value"] = "abc"
+        pipe = pipelines.RemoveNonePipeline()
+        processed = pipe.process_item(self.testItem, spider.FourmiSpider())
+
+        self.assertTrue(processed["value"] == "abc")
+
+        for key in self.testItem:
+            self.assertIsNotNone(processed[key])
+            if key is not "value":
+                self.assertIs(processed[key], "")
+
+    def test_DuplicatePipeline(self):
+        self.testItem["attribute"] = "test"
+        self.testItem["value"] = "test"
+        self.testItem["conditions"] = "test"
+
+        pipe = pipelines.DuplicatePipeline()
+        self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
+        self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())
+
+        otherItem = copy.deepcopy(self.testItem)
+        otherItem["value"] = "test1"
+        self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem)
+
+    def test_AttributeSelection(self):
+        item1 = copy.deepcopy(self.testItem)
+        item2 = copy.deepcopy(self.testItem)
+
+        item1["attribute"] = "abd"
+        item2["attribute"] = "abc"
+
+        s = spider.FourmiSpider(selected_attributes=["a.d"])
+        pipe = pipelines.AttributeSelectionPipeline()
+
+        self.assertEqual(pipe.process_item(item1, s), item1)
+        self.assertRaises(DropItem, pipe.process_item, item2, s)
\ No newline at end of file

From 743989edb875c5824c02a450418308142cc29d66 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 17:46:03 +0200
Subject: [PATCH 20/37] Second badge on a new line

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7769216..2b286a0 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # Fourmi
 
 **Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+
 **Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
 
 Fourmi is an web scraper for chemical substances. The program is designed to be

From b9a8c65d24f9f96258254251eaac47e7a3012744 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 17:46:33 +0200
Subject: [PATCH 21/37] For testing, Fourmi should not be a package

---
 __init__.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 __init__.py

diff --git a/__init__.py b/__init__.py
deleted file mode 100644
index 8b13789..0000000
--- a/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-

From 1557d1787756d099094eada65ac49b09864ec95f Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 18:22:28 +0200
Subject: [PATCH 22/37] Added documentation to the test cases

---
 tests/test_pipeline.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 9500272..f1fab36 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -1,8 +1,10 @@
 import copy
 import unittest
-from FourmiCrawler import pipelines, spider, items
+
 from scrapy.exceptions import DropItem
 
+from FourmiCrawler import pipelines, spider, items
+
 
 class TestPipelines(unittest.TestCase):
 
@@ -10,6 +12,7 @@ class TestPipelines(unittest.TestCase):
         self.testItem = items.Result()
 
     def test_NonePipeline(self):
+        #Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())
@@ -22,6 +25,7 @@ class TestPipelines(unittest.TestCase):
             self.assertIs(processed[key], "")
 
     def test_DuplicatePipeline(self):
+        #Testing the pipeline that removes duplicates.
         self.testItem["attribute"] = "test"
         self.testItem["value"] = "test"
         self.testItem["conditions"] = "test"
@@ -35,6 +39,7 @@ class TestPipelines(unittest.TestCase):
         self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem)
 
     def test_AttributeSelection(self):
+        #Testing the pipeline that selects attributes.
         item1 = copy.deepcopy(self.testItem)
         item2 = copy.deepcopy(self.testItem)

From f128c5431215f5011c3ae1d26f65a8191e4a0c2d Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 18:34:31 +0200
Subject: [PATCH 23/37] Sources don't need to be mangled

---
 FourmiCrawler/spider.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 08abb6b..6a9a12e 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,7 +9,7 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    __sources = []
+    _sources = []
     synonyms = []
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
@@ -29,7 +29,7 @@ class FourmiSpider(Spider):
         :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
-        for source in self.__sources:
+        for source in self._sources:
             if re.match(source.website, response.url):
                 log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
         return None
@@ -42,7 +42,7 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self.__sources:
+        for parser in self._sources:
             parser_requests = parser.new_compound_request(compound)
             if parser_requests is not None:
                 requests.append(parser_requests)
@@ -71,5 +71,5 @@ class FourmiSpider(Spider):
         A function add a new Parser object to the list of available parsers.
         :param source: A Source Object
         """
-        self.__sources.append(source)
+        self._sources.append(source)
         source.set_spider(self)
\ No newline at end of file

From 0c9862d836a2cd9cf41c022fe183190cbf21ea48 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 18:54:29 +0200
Subject: [PATCH 24/37] Damn you semicolon!

---
 FourmiCrawler/spider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 6a9a12e..fa1c5e2 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -20,7 +20,7 @@ class FourmiSpider(Spider):
         """
         super(FourmiSpider, self).__init__(*args, **kwargs)
         self.synonyms.append(compound)
-        self.selected_attributes = selected_attributes;
+        self.selected_attributes = selected_attributes
 
     def parse(self, response):
         """

From eb727bd6c4d9cc6dded03a03debc6506b14d7020 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 19:12:08 +0200
Subject: [PATCH 25/37] No two requests shall be the same!

---
 FourmiCrawler/spider.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index fa1c5e2..d1b99a7 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -10,7 +10,7 @@ class FourmiSpider(Spider):
     """
     name = "FourmiSpider"
     _sources = []
-    synonyms = []
+    synonyms = set()
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
@@ -19,7 +19,7 @@ class FourmiSpider(Spider):
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms.append(compound)
+        self.synonyms.add(compound)
         self.selected_attributes = selected_attributes
 
     def parse(self, response):
@@ -42,10 +42,12 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self._sources:
-            parser_requests = parser.new_compound_request(compound)
-            if parser_requests is not None:
-                requests.append(parser_requests)
+        if compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
         return requests
 
     def start_requests(self):

From 918d6729b6828a475923ffe5c5c47851ddd34a91 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 19:21:44 +0200
Subject: [PATCH 26/37] Added tests for the Spider

---
 tests/test_spider.py | 57 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 tests/test_spider.py

diff --git a/tests/test_spider.py b/tests/test_spider.py
new file mode 100644
index 0000000..086f1c8
--- /dev/null
+++ b/tests/test_spider.py
@@ -0,0 +1,57 @@
+import unittest
+
+from scrapy.http import Request
+
+from FourmiCrawler import spider
+from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.source import Source
+
+
+class TestFoumiSpider(unittest.TestCase):
+
+    def setUp(self):
+        self.compound = "test_compound"
+        self.attributes = ["a.*", ".*a"]
+        self.spi = spider.FourmiSpider(self.compound, self.attributes)
+
+    def test_init(self):
+        self.assertIn(self.compound, self.spi.synonyms)
+        for attr in self.attributes:
+            self.assertIn(attr, self.spi.selected_attributes)
+
+    def test_add_source(self):
+        src = Source()
+        self.spi.add_source(src)
+        self.assertIn(src, self.spi._sources)
+
+    def test_add_sources(self):
+        srcs = [Source(), Source(), Source()]
+        self.spi.add_sources(srcs)
+
+        for src in srcs:
+            self.assertIn(src, self.spi._sources)
+
+    def test_start_requests(self):
+        self.spi._sources = []
+
+        src = Source()
+        self.spi.add_source(src)
+        self.assertEqual(self.spi.start_requests(), [])
+
+        src2 = ChemSpider()
+        self.spi.add_source(src2)
+        self.assertIsNotNone(self.spi.start_requests())
+
+    def test_synonym_requests(self):
+        self.spi._sources = []
+
+        src = Source()
+        self.spi.add_source(src)
+        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
+        self.assertIn("new_compound", self.spi.synonyms)
+
+        src2 = ChemSpider()
+        self.spi.add_source(src2)
+        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
+        self.assertIn("other_compound", self.spi.synonyms)
+        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
\ No newline at end of file

From e1c01c7af6d7dc41fb90ab5fd2dda207ac98e35c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 19:24:55 +0200
Subject: [PATCH 27/37] Added some documentation for the synonyms request

---
 tests/test_spider.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_spider.py b/tests/test_spider.py
index 086f1c8..f5c8116 100644
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -15,16 +15,19 @@ class TestFoumiSpider(unittest.TestCase):
         self.spi = spider.FourmiSpider(self.compound, self.attributes)
 
     def test_init(self):
+        # Test the initiation of the Fourmi spider
         self.assertIn(self.compound, self.spi.synonyms)
         for attr in self.attributes:
             self.assertIn(attr, self.spi.selected_attributes)
 
     def test_add_source(self):
+        # Testing the source adding function of the Fourmi spider
         src = Source()
         self.spi.add_source(src)
         self.assertIn(src, self.spi._sources)
 
     def test_add_sources(self):
+        # Testing the function that adds multiple sources
         srcs = [Source(), Source(), Source()]
         self.spi.add_sources(srcs)
 
@@ -32,6 +35,7 @@ class TestFoumiSpider(unittest.TestCase):
             self.assertIn(src, self.spi._sources)
 
     def test_start_requests(self):
+        # A test for the function that generates the start requests
         self.spi._sources = []
 
         src = Source()
@@ -43,6 +47,7 @@ class TestFoumiSpider(unittest.TestCase):
         self.assertIsNotNone(self.spi.start_requests())
 
     def test_synonym_requests(self):
+        # A test for the synonym request function
         self.spi._sources = []
 
         src = Source()

From b3c230e83585606467ddee2d00381690b37fccd3 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 19:32:41 +0200
Subject: [PATCH 28/37] Import optimization

---
 tests/test_sourceloader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
index cf5ed0f..b130e8d 100644
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -1,4 +1,5 @@
 import unittest
+
 from sourceloader import SourceLoader
 
 

From 046fbed3cd40ae463fb5a9c76e3e291ba9fcc2c9 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 19:34:23 +0200
Subject: [PATCH 29/37] Code reformat

---
 FourmiCrawler/pipelines.py               |  8 ++++----
 FourmiCrawler/settings.py                |  2 +-
 FourmiCrawler/sources/ChemSpider.py      |  2 +-
 FourmiCrawler/sources/NIST.py            | 13 ++++++-------
 FourmiCrawler/sources/WikipediaParser.py |  2 +-
 fourmi.py                                |  2 +-
 tests/test_pipeline.py                   |  7 +++----
 tests/test_sourceloader.py               |  5 ++---
 tests/test_spider.py                     |  1 -
 9 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 1bcba3a..dd4e11d 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -4,8 +4,8 @@ import re
 
 from scrapy.exceptions import DropItem
 
-class RemoveNonePipeline(object):
 
+class RemoveNonePipeline(object):
     def __init__(self):
         pass
@@ -21,8 +21,8 @@ class RemoveNonePipeline(object):
                 item[key] = ""
         return item
 
-class DuplicatePipeline(object):
 
+class DuplicatePipeline(object):
     def __init__(self):
         self.known_values = set()
@@ -35,13 +35,13 @@ class DuplicatePipeline(object):
         """
         value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item
 
-class AttributeSelectionPipeline(object):
 
+class AttributeSelectionPipeline(object):
     def __init__(self):
         pass;
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index be7c451..8c1df07 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -3,7 +3,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
+# http://doc.scrapy.org/en/latest/topics/settings.html
 #
 
 BOT_NAME = 'FourmiCrawler'
diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 254c1a5..dfada5f 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -63,7 +63,7 @@ class ChemSpider(Source):
             # Test for properties without values, with one hardcoded exception
             if (not re.match(r'^\d', prop_value) or
                 (prop_name == 'Polarizability' and
-                 prop_value == '10-24cm3')):
+                     prop_value == '10-24cm3')):
                 continue
 
             # Match for condition in parentheses
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 2fe5966..a5f784d 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -10,7 +10,7 @@ from FourmiCrawler.items import Result
 
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
 # [TODO]: properties have references and comments which do not exist in the
-#   Result item, but should be included eventually.
+# Result item, but should be included eventually.
 
 class NIST(Source):
     """NIST Scraper plugin
@@ -18,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website and
     parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*" 
+    website = "http://webbook.nist.gov/*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@@ -78,7 +78,7 @@ class NIST(Source):
             requests.extend(self.parse_generic_data(table, summary))
         else:
             log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
-            continue #Assume unsupported
+            continue # Assume unsupported
         return requests
 
     def parse_generic_info(self, sel):
@@ -106,7 +106,7 @@ class NIST(Source):
         data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
 
         raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
-                               '/tt/text()')
+                                '/tt/text()')
         data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
 
         raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
@@ -132,10 +132,10 @@ class NIST(Source):
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
-                                     '/a/@href').extract()
+                                      '/a/@href').extract()
             if extra_data_url:
                 request = Request(url=self.website[:-1] + extra_data_url[0],
-                                 callback=self.parse_individual_datapoints)
+                                  callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
             data = []
@@ -183,7 +183,6 @@ class NIST(Source):
                 })
                 results.append(result)
-
         return results
 
     @staticmethod
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index c4f7a0f..868b49f 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -38,7 +38,7 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []
 
-        #be sure to get chembox (wikipedia template)
+        # be sure to get chembox (wikipedia template)
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
diff --git a/fourmi.py b/fourmi.py
index b4c2b48..683e257 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+# !/usr/bin/env python
 """
 Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index f1fab36..ab97954 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -7,12 +7,11 @@ from FourmiCrawler import pipelines, spider, items
 
 
 class TestPipelines(unittest.TestCase):
-
     def setUp(self):
         self.testItem = items.Result()
 
     def test_NonePipeline(self):
-        #Testing the pipeline that replaces the None values in items.
+        # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())
@@ -25,7 +24,7 @@ class TestPipelines(unittest.TestCase):
             self.assertIs(processed[key], "")
 
     def test_DuplicatePipeline(self):
-        #Testing the pipeline that removes duplicates.
+        # Testing the pipeline that removes duplicates.
         self.testItem["attribute"] = "test"
         self.testItem["value"] = "test"
         self.testItem["conditions"] = "test"
@@ -39,7 +38,7 @@ class TestPipelines(unittest.TestCase):
         self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem)
 
     def test_AttributeSelection(self):
-        #Testing the pipeline that selects attributes.
+        # Testing the pipeline that selects attributes.
         item1 = copy.deepcopy(self.testItem)
         item2 = copy.deepcopy(self.testItem)
diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
index b130e8d..1afca2d 100644
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -4,7 +4,6 @@ from sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):
-
     def setUp(self):
         self.loader = SourceLoader()
 
@@ -16,7 +15,7 @@ class TestSourceloader(unittest.TestCase):
         self.assertIn("Source: WikipediaParser", str(self.loader))
 
     def test_include(self):
-        #Tests for the include functionality.
+        # Tests for the include functionality.
         self.loader.include(["So.rc.*"])
 
         self.assertIn("Source: Source", str(self.loader))
@@ -25,7 +24,7 @@ class TestSourceloader(unittest.TestCase):
         self.assertNotIn("Source: WikipediaParser", str(self.loader))
 
     def test_exclude(self):
-        #Tests for the exclude functionality.
+        # Tests for the exclude functionality.
         self.loader.exclude(["So.rc.*"])
 
         self.assertNotIn("Source: Source", str(self.loader))
diff --git a/tests/test_spider.py b/tests/test_spider.py
index f5c8116..66878eb 100644
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -8,7 +8,6 @@ from FourmiCrawler.sources.source import Source
 
 
 class TestFoumiSpider(unittest.TestCase):
-
     def setUp(self):
         self.compound = "test_compound"
         self.attributes = ["a.*", ".*a"]

From 242e0bf628b2492314447c59a974f45b2cc0fc69 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 19:43:33 +0200
Subject: [PATCH 30/37] Code inspection

---
 FourmiCrawler/pipelines.py          |  8 +++++---
 FourmiCrawler/sources/ChemSpider.py |  4 +---
 FourmiCrawler/sources/NIST.py       |  5 +++--
 tests/test_pipeline.py              | 12 ++++++------
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index dd4e11d..55b0f49 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -9,7 +9,8 @@ class RemoveNonePipeline(object):
     def __init__(self):
         pass
 
-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         Processing the items so None values are replaced by empty strings
         :param item: The incoming item
@@ -43,9 +44,10 @@ class DuplicatePipeline(object):
 
 class AttributeSelectionPipeline(object):
     def __init__(self):
-        pass;
+        pass
 
-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         The items are processed using the selected attribute list available in the spider,
         items that don't match the selected items are dropped.
diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index dfada5f..8c0bd8b 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -61,9 +61,7 @@ class ChemSpider(Source):
             prop_conditions = ''
 
             # Test for properties without values, with one hardcoded exception
-            if (not re.match(r'^\d', prop_value) or
-                (prop_name == 'Polarizability' and
-                     prop_value == '10-24cm3')):
+            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                 continue
 
             # Match for condition in parentheses
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index a5f784d..6e8fabb 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -78,7 +78,7 @@ class NIST(Source):
             requests.extend(self.parse_generic_data(table, summary))
         else:
             log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
-            continue # Assume unsupported
+            continue  # Assume unsupported
         return requests
 
     def parse_generic_info(self, sel):
@@ -230,7 +230,8 @@ class NIST(Source):
 
         return results
 
-    def parse_individual_datapoints(self, response):
+    @staticmethod
+    def parse_individual_datapoints(response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index ab97954..dfb8e83 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -10,7 +10,7 @@ class TestPipelines(unittest.TestCase):
     def setUp(self):
         self.testItem = items.Result()
 
-    def test_NonePipeline(self):
+    def test_none_pipeline(self):
         # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
         pipe = pipelines.RemoveNonePipeline()
@@ -23,7 +23,7 @@ class TestPipelines(unittest.TestCase):
             if key is not "value":
                 self.assertIs(processed[key], "")
 
-    def test_DuplicatePipeline(self):
+    def test_duplicate_pipeline(self):
         # Testing the pipeline that removes duplicates.
         self.testItem["attribute"] = "test"
         self.testItem["value"] = "test"
         self.testItem["conditions"] = "test"
@@ -33,11 +33,11 @@ class TestPipelines(unittest.TestCase):
         self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
         self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())
 
-        otherItem = copy.deepcopy(self.testItem)
-        otherItem["value"] = "test1"
-        self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem)
+        other_item = copy.deepcopy(self.testItem)
+        other_item["value"] = "test1"
+        self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item)
 
-    def test_AttributeSelection(self):
+    def test_attribute_selection(self):
         # Testing the pipeline that selects attributes.
         item1 = copy.deepcopy(self.testItem)
         item2 = copy.deepcopy(self.testItem)

From 9ea8dfbe41b4b247b27b5efdf2632d6c89ce3868 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 19:52:54 +0200
Subject: [PATCH 31/37] Bumped the version number

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 683e257..57146ab 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -102,7 +102,7 @@ def search(docopt_arguments, source_loader):
 
 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
     loader = SourceLoader()
 
     if arguments["--include"]:

From 7a2ba29e77e322c553626a29268120dcf99f73ed Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 20:17:30 +0200
Subject: [PATCH 32/37] Added coverage

---
 .travis.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 63c9412..099f3e1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,10 +6,14 @@ python: 2.7
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
   - pip install Scrapy docopt
+  - pip install coverall
 
 # command to run tests, e.g. python setup.py test
 script:
-  - nosetests tests
+  - nosetests --with-coverage --cover-package=FourmiCrawler tests
 
 notifications:
-  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
\ No newline at end of file
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
+
+after_succes:
+  coveralls
\ No newline at end of file

From 90b8ac3285524beb9ba68d2c71bd483889951912 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 20:35:36 +0200
Subject: [PATCH 33/37] A little typo

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 099f3e1..ca5ec2e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,7 +6,7 @@ python: 2.7
 # command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
 install:
   - pip install Scrapy docopt
-  - pip install coverall
+  - pip install coveralls
 
 # command to run tests, e.g. python setup.py test
 script:

From 56624e4647e765f71c9ae91bb4b4c5565d4f4740 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 20:54:41 +0200
Subject: [PATCH 34/37] Not yet uploading stats

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index ca5ec2e..0dd67f5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,4 +16,4 @@ notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
 
 after_succes:
-  coveralls
\ No newline at end of file
+  coveralls --verbose
\ No newline at end of file

From 21d6fbfb2be303172f2393798a7250baf6686a39 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Wed, 4 Jun 2014 20:59:13 +0200
Subject: [PATCH 35/37] And again it was a typo

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 0dd67f5..34d3a88 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,5 +15,5 @@ script:
 notifications:
   slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
 
-after_succes:
+after_success:
   coveralls --verbose
\ No newline at end of file

From 31790cc10c9ce7219c970df20c02023f9de95a80 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Thu, 5 Jun 2014 15:45:10 +0200
Subject: [PATCH 36/37] Broken script parameter by the code inspector

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 57146ab..1913c9c 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -1,4 +1,4 @@
-# !/usr/bin/env python
+#!/usr/bin/env python
 """
 Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).

From b68a4e474b2c502c6e34721be3d5dfb8b05b68af Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Thu, 5 Jun 2014 15:46:29 +0200
Subject: [PATCH 37/37] 0.4.2

---
 fourmi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fourmi.py b/fourmi.py
index 1913c9c..3596cf3 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -102,7 +102,7 @@ def search(docopt_arguments, source_loader):
 
 # The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.2')
     loader = SourceLoader()
 
     if arguments["--include"]: