Merge branch 'feature/GUI' of https://github.com/Recondor/Fourmi into feature/GUI

commit bd12216ad4

.travis.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
# Config file for automatic testing at travis-ci.org

language: python
python: 2.7

# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
  - pip install Scrapy docopt

# command to run tests, e.g. python setup.py test
script:
  - nosetests tests

notifications:
  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
@@ -1,6 +1,4 @@
# Define here the models for your scraped items
#
# See documentation in:
# For more information on item definitions, see the Scrapy documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field
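The Result item class itself is not touched by this commit, so its definition does not appear in the diff. For orientation only, a minimal sketch of what it plausibly looks like, inferred from the fields the pipelines and sources access in this diff (attribute, value, conditions, plus a source field hinted at in a TODO); any other fields are omitted rather than guessed.

from scrapy.item import Item, Field


class Result(Item):
    # Field names inferred from item['attribute'], item['value'] and
    # item['conditions'] in pipelines.py; 'source' is suggested by the
    # "[todo] append sources of first item" comment.
    attribute = Field()
    value = Field()
    conditions = Field()
    source = Field()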
@@ -1,16 +1,16 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# For more information on item pipelines, see the Scrapy documentation in:
# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re

from scrapy.exceptions import DropItem


class RemoveNonePipeline(object):

    def __init__(self):
        self.known_values = set()
        pass

    def process_item(self, item, spider):
    @staticmethod
    def process_item(item, spider):
        """
        Processing the items so None values are replaced by empty strings
        :param item: The incoming item
@@ -22,8 +22,8 @@ class RemoveNonePipeline(object):
                item[key] = ""
        return item


class DuplicatePipeline(object):

class DuplicatePipeline(object):
    def __init__(self):
        self.known_values = set()

@@ -36,17 +36,18 @@ class DuplicatePipeline(object):
        """
        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
        else:
            self.known_values.add(value)
        return item


class AttributeSelectionPipeline(object):

    def __init__(self):
        pass;
        pass

    def process_item(self, item, spider):
    @staticmethod
    def process_item(item, spider):
        """
        The items are processed using the selected attribute list available in the spider,
        items that don't match the selected items are dropped.
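The removed boilerplate comment above points at the ITEM_PIPELINES setting; for orientation, a sketch of how these three pipelines could be registered in FourmiCrawler/settings.py. The dict-with-priority form and the particular numbers are assumptions (Scrapy releases of this era also accepted a plain list of class paths).

# Hypothetical registration in FourmiCrawler/settings.py; priorities are arbitrary.
ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.RemoveNonePipeline': 100,
    'FourmiCrawler.pipelines.DuplicatePipeline': 200,
    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 300,
}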
@@ -3,7 +3,7 @@
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'FourmiCrawler'
@@ -1,9 +1,12 @@
from source import Source
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result
import re


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

@@ -58,9 +61,7 @@ class ChemSpider(Source):
            prop_conditions = ''

            # Test for properties without values, with one hardcoded exception
            if (not re.match(r'^\d', prop_value) or
                    (prop_name == 'Polarizability' and
                     prop_value == '10-24cm3')):
            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                continue

            # Match for condition in parentheses
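To make the collapsed condition above easier to read: it skips any property whose value does not start with a digit, and additionally skips the Polarizability unit string even though it does start with one. A small standalone sketch with made-up property values:

import re

samples = [('Boiling Point', '78.4 C'),
           ('Appearance', 'colourless liquid'),
           ('Polarizability', '10-24cm3')]

for prop_name, prop_value in samples:
    skipped = (not re.match(r'^\d', prop_value) or
               (prop_name == 'Polarizability' and prop_value == '10-24cm3'))
    print('%s: %s' % (prop_name, 'skipped' if skipped else 'kept'))
# -> Boiling Point: kept, Appearance: skipped, Polarizability: skipped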
@@ -1,13 +1,16 @@
from source import Source
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result
import re


# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
# Result item, but should be included eventually.
# Result item, but should be included eventually.

class NIST(Source):
    """NIST Scraper plugin
@@ -15,7 +18,7 @@ class NIST(Source):
    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    website = "http://webbook.nist.gov/*"
    website = "http://webbook.nist.gov/*"

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

@@ -75,7 +78,7 @@ class NIST(Source):
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                continue #Assume unsupported
                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):
@@ -103,7 +106,7 @@ class NIST(Source):
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
@@ -129,10 +132,10 @@ class NIST(Source):
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
                                      '/a/@href').extract()
            if extra_data_url:
                request = Request(url=self.website[:-1] + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = []
@@ -180,7 +183,6 @@ class NIST(Source):
            })
            results.append(result)


        return results

    @staticmethod
@@ -228,7 +230,8 @@ class NIST(Source):

        return results

    def parse_individual_datapoints(self, response):
    @staticmethod
    def parse_individual_datapoints(response):
        """Parses the page linked from aggregate data"""
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
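A side note on the self.website[:-1] + extra_data_url[0] pattern above: the trailing asterisk in the website attribute lets the same string serve both as the pattern the spider matches response URLs against and, with the wildcard stripped, as a URL prefix for relative links scraped from the page. A minimal illustration; the relative path is a made-up example:

website = "http://webbook.nist.gov/*"
relative_link = "cgi/cbook.cgi?ID=C64175&Units=SI"   # hypothetical href taken from a results page
absolute_url = website[:-1] + relative_link
# absolute_url == "http://webbook.nist.gov/cgi/cbook.cgi?ID=C64175&Units=SI"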
@@ -1,9 +1,11 @@
import re

from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result
import re


class WikipediaParser(Source):
@@ -36,7 +38,7 @@ class WikipediaParser(Source):
        """ scrape data from infobox on wikipedia. """
        items = []

        #be sure to get chembox (wikipedia template)
        # be sure to get chembox (wikipedia template)
        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
        prop_names = tr_list[::2]
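The tr_list[::2] slice above relies on the chembox cells alternating between property names and property values. A tiny sketch with made-up data; the complementary [1::2] slice for the values is an assumption, since that line falls outside this hunk:

tr_list = ['Molar mass', '46.07 g/mol', 'Density', '0.789 g/cm3']
prop_names = tr_list[::2]    # -> ['Molar mass', 'Density']
prop_values = tr_list[1::2]  # -> ['46.07 g/mol', '0.789 g/cm3']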
@@ -7,15 +7,32 @@ class Source:
    _spider = None

    def __init__(self):
        """
        Initiation of a new Source
        """
        pass

    def parse(self, reponse):
        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
    def parse(self, response):
        """
        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
        :param response: A Scrapy Response object
        :return: A list of Result items and new Scrapy Requests
        """
        log.msg("The parse function of the empty source was used.", level=log.WARNING)
        pass

    def new_compound_request(self, compound):
        """
        This function should return a Scrapy Request for the given compound request.
        :param compound: A compound name.
        :return: A new Scrapy Request
        """
        # return Request(url=self.website[:-1] + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
        """
        A Function to save the associated spider.
        :param spider: A FourmiSpider object
        """
        self._spider = spider
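Taken together, the docstrings added above describe the contract a concrete source has to fulfil. A minimal sketch of a subclass under those assumptions; ExampleSource and its URL are illustrative and not part of the repository, and new_compound_request simply mirrors the hint left commented out in the base class:

from scrapy.http import Request

from FourmiCrawler.sources.source import Source


class ExampleSource(Source):
    website = "http://example.org/*"   # pattern the spider matches response URLs against

    def parse(self, response):
        # Would return a list of Result items and/or follow-up Requests.
        return []

    def new_compound_request(self, compound):
        return Request(url=self.website[:-1] + compound, callback=self.parse)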
@@ -1,43 +1,77 @@
import re

from scrapy.spider import Spider
from scrapy import log
import re


class FourmiSpider(Spider):
    """
    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"
    __parsers = []
    synonyms = []
    _sources = []
    synonyms = set()

    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """
        Initiation of the Spider
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the attributes should match.
        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.synonyms.append(compound)
        self.selected_attributes = selected_attributes;
        self.synonyms.add(compound)
        self.selected_attributes = selected_attributes

    def parse(self, reponse):
        for parser in self.__parsers:
            if re.match(parser.website, reponse.url):
                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
                return parser.parse(reponse)
    def parse(self, response):
        """
        The function that is called when a response to a request is available. This function distributes this to a
        source which should be able to handle parsing the data.
        :param response: A Scrapy Response object that should be parsed
        :return: A list of Result items and new Request to be handled by the scrapy core.
        """
        for source in self._sources:
            if re.match(source.website, response.url):
                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                return source.parse(response)
        return None

    def get_synonym_requests(self, compound):
        """
        A function that generates new Scrapy Request for each source given a new synonym of a compound.
        :param compound: A compound name
        :return: A list of Scrapy Request objects
        """
        requests = []
        for parser in self.__parsers:
            parser_requests = parser.new_compound_request(compound)
            if parser_requests is not None:
                requests.append(parser_requests)
        if compound not in self.synonyms:
            self.synonyms.add(compound)
            for parser in self._sources:
                parser_requests = parser.new_compound_request(compound)
                if parser_requests is not None:
                    requests.append(parser_requests)
        return requests

    def start_requests(self):
        """
        The function called by Scrapy for it's first Requests
        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
        """
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
        return requests

    def add_parsers(self, parsers):
        for parser in parsers:
            self.add_parser(parser)
    def add_sources(self, sources):
        """
        A function to add a new Parser objects to the list of available sources.
        :param sources: A list of Source Objects.
        """
        for parser in sources:
            self.add_source(parser)

    def add_parser(self, parser):
        self.__parsers.append(parser)
        parser.set_spider(self)
    def add_source(self, source):
        """
        A function add a new Parser object to the list of available parsers.
        :param source: A Source Object
        """
        self._sources.append(source)
        source.set_spider(self)
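A rough usage sketch of the renamed API, based only on what is visible in this diff and in the tests further down; the compound name is an arbitrary example. The spider hands each response to the first source whose website pattern matches the response URL, so a source must be registered before its pages can be parsed:

from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.sources.ChemSpider import ChemSpider

spider = FourmiSpider(compound="ethanol", selected_attributes=[".*"])
spider.add_source(ChemSpider())       # registers the source and links it back to the spider
requests = spider.start_requests()    # one Request per known synonym per source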
@@ -1,5 +1,9 @@
# Fourmi

**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)

**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)

Fourmi is an web scraper for chemical substances. The program is designed to be
used as a search engine to search multiple chemical databases for a specific
substance. The program will produce all available attributes of the substance
fourmi.py (34 lines changed)
@@ -1,4 +1,4 @@
#!/usr/bin/env python
# !/usr/bin/env python
"""
Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
@@ -33,9 +33,16 @@ from FourmiCrawler.spider import FourmiSpider
from sourceloader import SourceLoader


def setup_crawler(searchable, settings, source_loader, attributes):
    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
    spider.add_parsers(source_loader.sources)
def setup_crawler(compound, settings, source_loader, attributes):
    """
    This function prepares and start the crawler which starts the actual search on the internet
    :param compound: The compound which should be searched
    :param settings: A scrapy settings object
    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    spider.add_sources(source_loader.sources)
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
@@ -44,8 +51,13 @@ def setup_crawler(searchable, settings, source_loader, attributes):


def scrapy_settings_manipulation(docopt_arguments):
    """
    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
    project these are command line arguments.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    """
    settings = get_project_settings()
    # [todo] - add at least a warning for files that already exist

    if docopt_arguments["--output"] != 'result.*format*':
        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
    elif docopt_arguments["--format"] == "jsonlines":
@@ -60,6 +72,10 @@ def scrapy_settings_manipulation(docopt_arguments):


def start_log(docopt_arguments):
    """
    This function starts the logging functionality of Scrapy using the settings given by the CLI.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    """
    if docopt_arguments["--log"] is not None:
        if docopt_arguments["--verbose"]:
            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
@@ -73,14 +89,20 @@ def start_log(docopt_arguments):


def search(docopt_arguments, source_loader):
    """
    The function that facilitates the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    start_log(docopt_arguments)
    settings = scrapy_settings_manipulation(docopt_arguments)
    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
    reactor.run()


# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
    loader = SourceLoader()

    if arguments["--include"]:
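For context on scrapy_settings_manipulation above, a small sketch of how the --output and --format arguments might translate into Scrapy feed settings. Only the FEED_URI override and the jsonlines branch are visible in this hunk; the FEED_FORMAT key, the helper name and the default-placeholder handling are assumptions, and settings.overrides is the API of the Scrapy versions this project targets.

from scrapy.utils.project import get_project_settings


def configure_feed(output_uri, output_format):
    # Hypothetical helper mirroring scrapy_settings_manipulation().
    settings = get_project_settings()
    if output_uri != 'result.*format*':          # CLI default placeholder, as in the hunk above
        settings.overrides["FEED_URI"] = output_uri
    if output_format:
        settings.overrides["FEED_FORMAT"] = output_format   # assumed key, e.g. "jsonlines"
    return settings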
@@ -1,6 +1,7 @@
import inspect
import os
import re

from FourmiCrawler.sources.source import Source


@@ -8,6 +9,10 @@ class SourceLoader:
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
        :param rel_dir: A relative path to a directory.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()
@@ -21,18 +26,30 @@ class SourceLoader:
                known_parser.add(cls)

    def include(self, source_names):
        """
        This function excludes all sources that don't match the given regular expressions.
        :param source_names: A list of regular expression (strings)
        """
        new = set()
        for name in source_names:
            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = list(new)

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        :param source_names: A list of regular expression (strings)
        """
        exclude = []
        for name in source_names:
            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = [src for src in self.sources if src not in exclude]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        :return: a string with all available sources.
        """
        string = ""
        for src in self.sources:
            string += "Source: " + src.__class__.__name__
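A short usage sketch of the loader methods documented above, as exercised by the tests that follow; the regular expressions are arbitrary examples:

from sourceloader import SourceLoader

loader = SourceLoader()              # indexes FourmiCrawler/sources by default
print(str(loader))                   # one "Source: <ClassName>" entry per discovered source
loader.exclude(["Wikipedia.*"])      # drop sources whose class name matches any pattern
loader.include(["NIST", "Chem.*"])   # keep only sources whose class name matches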
tests/__init__.py (new file, 1 line)
@@ -0,0 +1 @@

tests/test_pipeline.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import copy
import unittest

from scrapy.exceptions import DropItem

from FourmiCrawler import pipelines, spider, items


class TestPipelines(unittest.TestCase):
    def setUp(self):
        self.testItem = items.Result()

    def test_none_pipeline(self):
        # Testing the pipeline that replaces the None values in items.
        self.testItem["value"] = "abc"
        pipe = pipelines.RemoveNonePipeline()
        processed = pipe.process_item(self.testItem, spider.FourmiSpider())

        self.assertTrue(processed["value"] == "abc")

        for key in self.testItem:
            self.assertIsNotNone(processed[key])
            if key is not "value":
                self.assertIs(processed[key], "")

    def test_duplicate_pipeline(self):
        # Testing the pipeline that removes duplicates.
        self.testItem["attribute"] = "test"
        self.testItem["value"] = "test"
        self.testItem["conditions"] = "test"

        pipe = pipelines.DuplicatePipeline()
        self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
        self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())

        other_item = copy.deepcopy(self.testItem)
        other_item["value"] = "test1"
        self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item)

    def test_attribute_selection(self):
        # Testing the pipeline that selects attributes.
        item1 = copy.deepcopy(self.testItem)
        item2 = copy.deepcopy(self.testItem)

        item1["attribute"] = "abd"
        item2["attribute"] = "abc"

        s = spider.FourmiSpider(selected_attributes=["a.d"])
        pipe = pipelines.AttributeSelectionPipeline()

        self.assertEqual(pipe.process_item(item1, s), item1)
        self.assertRaises(DropItem, pipe.process_item, item2, s)
tests/test_sourceloader.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import unittest

from sourceloader import SourceLoader


class TestSourceloader(unittest.TestCase):
    def setUp(self):
        self.loader = SourceLoader()

    def test_init(self):
        # Test if sourceloader points to the right directory, where the sources are present.
        self.assertIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))

    def test_include(self):
        # Tests for the include functionality.
        self.loader.include(["So.rc.*"])

        self.assertIn("Source: Source", str(self.loader))
        self.assertNotIn("Source: NIST", str(self.loader))
        self.assertNotIn("Source: ChemSpider", str(self.loader))
        self.assertNotIn("Source: WikipediaParser", str(self.loader))

    def test_exclude(self):
        # Tests for the exclude functionality.
        self.loader.exclude(["So.rc.*"])

        self.assertNotIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))
tests/test_spider.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import unittest

from scrapy.http import Request

from FourmiCrawler import spider
from FourmiCrawler.sources.ChemSpider import ChemSpider
from FourmiCrawler.sources.source import Source


class TestFoumiSpider(unittest.TestCase):
    def setUp(self):
        self.compound = "test_compound"
        self.attributes = ["a.*", ".*a"]
        self.spi = spider.FourmiSpider(self.compound, self.attributes)

    def test_init(self):
        # Test the initiation of the Fourmi spider
        self.assertIn(self.compound, self.spi.synonyms)
        for attr in self.attributes:
            self.assertIn(attr, self.spi.selected_attributes)

    def test_add_source(self):
        # Testing the source adding function of the Fourmi spider
        src = Source()
        self.spi.add_source(src)
        self.assertIn(src, self.spi._sources)

    def test_add_sources(self):
        # Testing the function that adds multiple sources
        srcs = [Source(), Source(), Source()]
        self.spi.add_sources(srcs)

        for src in srcs:
            self.assertIn(src, self.spi._sources)

    def test_start_requests(self):
        # A test for the function that generates the start requests
        self.spi._sources = []

        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])

        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsNotNone(self.spi.start_requests())

    def test_synonym_requests(self):
        # A test for the synonym request function
        self.spi._sources = []

        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)

        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])