diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..63c9412
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,15 @@
+# Config file for automatic testing at travis-ci.org
+
+language: python
+python: 2.7
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+  - pip install Scrapy docopt
+
+# command to run tests, e.g. python setup.py test
+script:
+  - nosetests tests
+
+notifications:
+  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
\ No newline at end of file
diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 0d4a405..55b0f49 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -4,12 +4,13 @@
 import re
 
 from scrapy.exceptions import DropItem
 
+
 class RemoveNonePipeline(object):
-    def __init__(self):
-        self.known_values = set()
+    pass
 
-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         Processing the items so None values are replaced by empty strings
         :param item: The incoming item
@@ -21,8 +22,8 @@ class RemoveNonePipeline(object):
                 item[key] = ""
         return item
 
-class DuplicatePipeline(object):
 
+class DuplicatePipeline(object):
     def __init__(self):
         self.known_values = set()
 
@@ -35,17 +36,18 @@ class DuplicatePipeline(object):
         """
         value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
         else:
             self.known_values.add(value)
             return item
 
+
 class AttributeSelectionPipeline(object):
-    def __init__(self):
-        pass;
+    pass
 
-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         The items are processed using the selected attribute list available in the spider,
         items that don't match the selected items are dropped.
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index be7c451..8c1df07 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -3,7 +3,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-# http://doc.scrapy.org/en/latest/topics/settings.html 
+# http://doc.scrapy.org/en/latest/topics/settings.html
 #
 
 BOT_NAME = 'FourmiCrawler'
diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 2fcd07c..8c0bd8b 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -1,9 +1,12 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
+
 
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
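
The pipelines.py hunks above turn RemoveNonePipeline and AttributeSelectionPipeline into stateless @staticmethod processors, while DuplicatePipeline keeps per-instance state to track what it has already seen. As a minimal, self-contained sketch of that deduplication pattern (the class name here is illustrative, not part of this changeset): each item is collapsed to a hashable key, and the item is dropped when the key has been seen before.

from scrapy.exceptions import DropItem


class DedupSketch(object):
    """Illustrative stand-in for DuplicatePipeline above; not part of this diff."""

    def __init__(self):
        self.known_values = set()  # hashable keys of every item seen so far

    def process_item(self, item, spider):
        # Collapse the item to the fields that define "the same measurement".
        key = (item['attribute'], item['value'], item['conditions'])
        if key in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        self.known_values.add(key)
        return item
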
@@ -58,9 +61,7 @@ class ChemSpider(Source):
             prop_conditions = ''
 
             # Test for properties without values, with one hardcoded exception
-            if (not re.match(r'^\d', prop_value) or
-                    (prop_name == 'Polarizability' and
-                     prop_value == '10-24cm3')):
+            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                 continue
 
             # Match for condition in parentheses
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 0b75b17..6e8fabb 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -1,13 +1,16 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
+
 
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
 # [TODO]: properties have references and comments which do not exist in the
-# Result item, but should be included eventually. 
+# Result item, but should be included eventually.
 
 class NIST(Source):
     """NIST Scraper plugin
@@ -15,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*" 
+    website = "http://webbook.nist.gov/*"
 
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@@ -75,7 +78,7 @@ class NIST(Source):
                 requests.extend(self.parse_generic_data(table, summary))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
-                continue #Assume unsupported
+                continue  # Assume unsupported
         return requests
 
     def parse_generic_info(self, sel):
@@ -103,7 +106,7 @@ class NIST(Source):
         data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
 
         raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
-            '/tt/text()')
+                                '/tt/text()')
         data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
 
         raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
@@ -129,10 +132,10 @@ class NIST(Source):
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
-                '/a/@href').extract()
+                                      '/a/@href').extract()
             if extra_data_url:
                 request = Request(url=self.website[:-1] + extra_data_url[0],
-                    callback=self.parse_individual_datapoints)
+                                  callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
             data = []
@@ -180,7 +183,6 @@ class NIST(Source):
             })
             results.append(result)
-
         return results
 
     @staticmethod
@@ -228,7 +230,8 @@ class NIST(Source):
 
         return results
 
-    def parse_individual_datapoints(self, response):
+    @staticmethod
+    def parse_individual_datapoints(response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index cb7d0b9..868b49f 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class WikipediaParser(Source):
@@ -36,7 +38,7 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []
 
-        #be sure to get chembox (wikipedia template)
+        # be sure to get chembox (wikipedia template)
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 08abb6b..d1b99a7 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -9,8 +9,8 @@ class FourmiSpider(Spider):
     A spider writen for the Fourmi Project which calls upon all available
     sources to request and scrape data.
     """
     name = "FourmiSpider"
-    __sources = []
-    synonyms = []
+    _sources = []
+    synonyms = set()
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
@@ -19,8 +19,8 @@ class FourmiSpider(Spider):
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms.append(compound)
-        self.selected_attributes = selected_attributes;
+        self.synonyms.add(compound)
+        self.selected_attributes = selected_attributes
 
     def parse(self, response):
         """
@@ -29,7 +29,7 @@ class FourmiSpider(Spider):
         :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
-        for source in self.__sources:
+        for source in self._sources:
             if re.match(source.website, response.url):
                 log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
@@ -42,10 +42,12 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self.__sources:
-            parser_requests = parser.new_compound_request(compound)
-            if parser_requests is not None:
-                requests.append(parser_requests)
+        if compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
         return requests
 
     def start_requests(self):
@@ -71,5 +73,5 @@ class FourmiSpider(Spider):
         A function add a new Parser object to the list of available parsers.
         :param source: A Source Object
         """
-        self.__sources.append(source)
+        self._sources.append(source)
         source.set_spider(self)
\ No newline at end of file
diff --git a/README.md b/README.md
index e9150a6..2b286a0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
 # Fourmi
 
+**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+
+**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
+
 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
 substance. The program will produce all available attributes of the substance
diff --git a/fourmi.py b/fourmi.py
index b4c2b48..683e257 100755
--- a/fourmi.py
+++ b/fourmi.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
 """
 Fourmi, a web scraper build to search specific information for a given compound
 (and it's pseudonyms).
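
The spider.py hunks above change synonyms from a list to a set and guard get_synonym_requests so that a compound name fans out to the sources only once. A stripped-down sketch of that guard, assuming source objects with the new_compound_request method shown in this diff (the class name here is hypothetical):

class SynonymFanoutSketch(object):
    """Illustrative reduction of FourmiSpider.get_synonym_requests; not part of this diff."""

    def __init__(self, sources):
        self.synonyms = set()  # compound names that were already fanned out
        self.sources = sources

    def get_synonym_requests(self, compound):
        requests = []
        if compound not in self.synonyms:  # set membership is an O(1) check
            self.synonyms.add(compound)
            for source in self.sources:
                request = source.new_compound_request(compound)
                if request is not None:
                    requests.append(request)
        return requests  # a second call with the same compound returns []
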
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
new file mode 100644
index 0000000..dfb8e83
--- /dev/null
+++ b/tests/test_pipeline.py
@@ -0,0 +1,52 @@
+import copy
+import unittest
+
+from scrapy.exceptions import DropItem
+
+from FourmiCrawler import pipelines, spider, items
+
+
+class TestPipelines(unittest.TestCase):
+    def setUp(self):
+        self.testItem = items.Result()
+
+    def test_none_pipeline(self):
+        # Testing the pipeline that replaces the None values in items.
+        self.testItem["value"] = "abc"
+        pipe = pipelines.RemoveNonePipeline()
+        processed = pipe.process_item(self.testItem, spider.FourmiSpider())
+
+        self.assertTrue(processed["value"] == "abc")
+
+        for key in self.testItem:
+            self.assertIsNotNone(processed[key])
+            if key != "value":
+                self.assertEqual(processed[key], "")
+
+    def test_duplicate_pipeline(self):
+        # Testing the pipeline that removes duplicates.
+        self.testItem["attribute"] = "test"
+        self.testItem["value"] = "test"
+        self.testItem["conditions"] = "test"
+
+        pipe = pipelines.DuplicatePipeline()
+        self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
+        self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())
+
+        other_item = copy.deepcopy(self.testItem)
+        other_item["value"] = "test1"
+        self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item)
+
+    def test_attribute_selection(self):
+        # Testing the pipeline that selects attributes.
+        item1 = copy.deepcopy(self.testItem)
+        item2 = copy.deepcopy(self.testItem)
+
+        item1["attribute"] = "abd"
+        item2["attribute"] = "abc"
+
+        s = spider.FourmiSpider(selected_attributes=["a.d"])
+        pipe = pipelines.AttributeSelectionPipeline()
+
+        self.assertEqual(pipe.process_item(item1, s), item1)
+        self.assertRaises(DropItem, pipe.process_item, item2, s)
\ No newline at end of file
diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
new file mode 100644
index 0000000..1afca2d
--- /dev/null
+++ b/tests/test_sourceloader.py
@@ -0,0 +1,33 @@
+import unittest
+
+from sourceloader import SourceLoader
+
+
+class TestSourceloader(unittest.TestCase):
+    def setUp(self):
+        self.loader = SourceLoader()
+
+    def test_init(self):
+        # Test if sourceloader points to the right directory, where the sources are present.
+        self.assertIn("Source: Source", str(self.loader))
+        self.assertIn("Source: NIST", str(self.loader))
+        self.assertIn("Source: ChemSpider", str(self.loader))
+        self.assertIn("Source: WikipediaParser", str(self.loader))
+
+    def test_include(self):
+        # Tests for the include functionality.
+        self.loader.include(["So.rc.*"])
+
+        self.assertIn("Source: Source", str(self.loader))
+        self.assertNotIn("Source: NIST", str(self.loader))
+        self.assertNotIn("Source: ChemSpider", str(self.loader))
+        self.assertNotIn("Source: WikipediaParser", str(self.loader))
+
+    def test_exclude(self):
+        # Tests for the exclude functionality.
+        self.loader.exclude(["So.rc.*"])
+
+        self.assertNotIn("Source: Source", str(self.loader))
+        self.assertIn("Source: NIST", str(self.loader))
+        self.assertIn("Source: ChemSpider", str(self.loader))
+        self.assertIn("Source: WikipediaParser", str(self.loader))
diff --git a/tests/test_spider.py b/tests/test_spider.py
new file mode 100644
index 0000000..66878eb
--- /dev/null
+++ b/tests/test_spider.py
@@ -0,0 +1,61 @@
+import unittest
+
+from scrapy.http import Request
+
+from FourmiCrawler import spider
+from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.source import Source
+
+
+class TestFourmiSpider(unittest.TestCase):
+    def setUp(self):
+        self.compound = "test_compound"
+        self.attributes = ["a.*", ".*a"]
+        self.spi = spider.FourmiSpider(self.compound, self.attributes)
+
+    def test_init(self):
+        # Test the initiation of the Fourmi spider
+        self.assertIn(self.compound, self.spi.synonyms)
+        for attr in self.attributes:
+            self.assertIn(attr, self.spi.selected_attributes)
+
+    def test_add_source(self):
+        # Testing the source adding function of the Fourmi spider
+        src = Source()
+        self.spi.add_source(src)
+        self.assertIn(src, self.spi._sources)
+
+    def test_add_sources(self):
+        # Testing the function that adds multiple sources
+        srcs = [Source(), Source(), Source()]
+        self.spi.add_sources(srcs)
+
+        for src in srcs:
+            self.assertIn(src, self.spi._sources)
+
+    def test_start_requests(self):
+        # A test for the function that generates the start requests
+        self.spi._sources = []
+
+        src = Source()
+        self.spi.add_source(src)
+        self.assertEqual(self.spi.start_requests(), [])
+
+        src2 = ChemSpider()
+        self.spi.add_source(src2)
+        self.assertIsNotNone(self.spi.start_requests())
+
+    def test_synonym_requests(self):
+        # A test for the synonym request function
+        self.spi._sources = []
+
+        src = Source()
+        self.spi.add_source(src)
+        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
+        self.assertIn("new_compound", self.spi.synonyms)
+
+        src2 = ChemSpider()
+        self.spi.add_source(src2)
+        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
+        self.assertIn("other_compound", self.spi.synonyms)
+        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])
\ No newline at end of file
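
test_attribute_selection above exercises AttributeSelectionPipeline with selected_attributes=["a.d"], keeping the item whose attribute is "abd" and dropping "abc". A minimal sketch of that selection rule (a hypothetical helper, assuming the pipeline matches attributes with re.match as the test patterns suggest):

import re


def is_selected(attribute, selected_patterns):
    # An item survives when at least one selected pattern matches its attribute.
    return any(re.match(pattern, attribute) for pattern in selected_patterns)


# "a.d" matches "abd" (the dot matches any single character) but not "abc",
# which is why item1 passes the pipeline while item2 raises DropItem.
assert is_selected("abd", ["a.d"])
assert not is_selected("abc", ["a.d"])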