
Merge branch 'feature/testing' into develop

Jip J. Dekker 2014-06-04 19:44:50 +02:00
commit 475d7fc865
13 changed files with 214 additions and 38 deletions

.travis.yml (new file, +15)

@@ -0,0 +1,15 @@
+# Config file for automatic testing at travis-ci.org
+
+language: python
+python: 2.7
+
+# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
+install:
+    - pip install Scrapy docopt
+
+# command to run tests, e.g. python setup.py test
+script:
+    - nosetests tests
+
+notifications:
+    slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
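
Note: the Travis "script" step can be reproduced locally before pushing. A
minimal sketch, assuming nose is installed next to Scrapy and docopt; the
run_tests.py helper name is made up, not part of this commit:

    # run_tests.py -- hypothetical local helper, mirrors "nosetests tests"
    import nose

    if __name__ == '__main__':
        # Equivalent to running "nosetests tests" from the repository root.
        nose.run(argv=['nosetests', 'tests'])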

FourmiCrawler/pipelines.py

@@ -4,12 +4,13 @@ import re
 from scrapy.exceptions import DropItem
 
 
 class RemoveNonePipeline(object):
     def __init__(self):
-        self.known_values = set()
+        pass
 
-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         Processing the items so None values are replaced by empty strings
         :param item: The incoming item
@@ -21,8 +22,8 @@ class RemoveNonePipeline(object):
             item[key] = ""
         return item
 
-class DuplicatePipeline(object):
+class DuplicatePipeline(object):
     def __init__(self):
         self.known_values = set()
@@ -35,17 +36,18 @@ class DuplicatePipeline(object):
         """
         value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item
 
 
 class AttributeSelectionPipeline(object):
     def __init__(self):
-        pass;
+        pass
 
-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         The items are processed using the selected attribute list available in the spider,
         items that don't match the selected items are dropped.
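
Note: these pipeline classes only run once they are registered in the Scrapy
settings. A minimal sketch using Scrapy's standard ITEM_PIPELINES setting;
the priority values are illustrative assumptions, not taken from this commit:

    # In FourmiCrawler/settings.py (sketch). Lower numbers run earlier, so
    # None-removal happens before deduplication and attribute selection.
    ITEM_PIPELINES = {
        'FourmiCrawler.pipelines.RemoveNonePipeline': 100,
        'FourmiCrawler.pipelines.DuplicatePipeline': 200,
        'FourmiCrawler.pipelines.AttributeSelectionPipeline': 300,
    }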

FourmiCrawler/settings.py

@@ -3,7 +3,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
+# http://doc.scrapy.org/en/latest/topics/settings.html
 #
 
 BOT_NAME = 'FourmiCrawler'

FourmiCrawler/sources/ChemSpider.py

@@ -1,9 +1,12 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
@@ -58,9 +61,7 @@ class ChemSpider(Source):
         prop_conditions = ''
 
         # Test for properties without values, with one hardcoded exception
-        if (not re.match(r'^\d', prop_value) or
-                (prop_name == 'Polarizability' and
-                 prop_value == '10-24cm3')):
+        if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
             continue
 
         # Match for condition in parentheses
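
Note: the collapsed condition still encodes two drop rules: skip properties
whose value does not start with a digit, plus one hardcoded bogus
Polarizability value (which does start with a digit, hence the special case).
A worked example of the same expression, with made-up property rows:

    import re

    samples = [('Density', '0.997 g/cm3'),           # kept: leading digit
               ('Appearance', 'colourless liquid'),  # dropped: no leading digit
               ('Polarizability', '10-24cm3')]       # dropped: hardcoded exception

    for prop_name, prop_value in samples:
        if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
            continue
        print('%s = %s' % (prop_name, prop_value))  # prints only the Density row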

FourmiCrawler/sources/NIST.py

@@ -1,13 +1,16 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
 # [TODO]: properties have references and comments which do not exist in the
-# Result item, but should be included eventually.
+# Result item, but should be included eventually.
 
 
 class NIST(Source):
     """NIST Scraper plugin
@@ -15,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """
-    website = "http://webbook.nist.gov/*"
+    website = "http://webbook.nist.gov/*"
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@@ -75,7 +78,7 @@ class NIST(Source):
                 requests.extend(self.parse_generic_data(table, summary))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
-                continue #Assume unsupported
+                continue  # Assume unsupported
         return requests
@@ -103,7 +106,7 @@ class NIST(Source):
         data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
 
         raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
-                       '/tt/text()')
+                                '/tt/text()')
         data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
 
         raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
@@ -129,10 +132,10 @@ class NIST(Source):
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
-                             '/a/@href').extract()
+                                      '/a/@href').extract()
             if extra_data_url:
                 request = Request(url=self.website[:-1] + extra_data_url[0],
-                          callback=self.parse_individual_datapoints)
+                                  callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
             data = []
@@ -180,7 +183,6 @@ class NIST(Source):
                 })
             results.append(result)
         return results
-
 
     @staticmethod
@@ -228,7 +230,8 @@ class NIST(Source):
         return results
 
-    def parse_individual_datapoints(self, response):
+    @staticmethod
+    def parse_individual_datapoints(response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]
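
Note: the recurring edit in this file (and in pipelines.py above) turns
methods that never touch self into static methods. A minimal sketch of the
pattern, with a made-up class:

    class Example(object):  # illustration only, not a class from this commit
        @staticmethod
        def normalize(value):
            # No self needed: the method only transforms its argument.
            return value.strip()

    print(Example.normalize('  128.  '))    # callable on the class itself
    print(Example().normalize('  128.  '))  # and still on instances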

FourmiCrawler/sources/WikipediaParser.py

@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re
 
 
 class WikipediaParser(Source):
@@ -36,7 +38,7 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []
 
-        #be sure to get chembox (wikipedia template)
+        # be sure to get chembox (wikipedia template)
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]

FourmiCrawler/spider.py

@@ -9,8 +9,8 @@ class FourmiSpider(Spider):
     A spider written for the Fourmi Project which calls upon all available sources to request and scrape data.
     """
     name = "FourmiSpider"
-    __sources = []
-    synonyms = []
+    _sources = []
+    synonyms = set()
 
     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
         """
@@ -19,8 +19,8 @@ class FourmiSpider(Spider):
         :param selected_attributes: A list of regular expressions that the attributes should match.
         """
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms.append(compound)
-        self.selected_attributes = selected_attributes;
+        self.synonyms.add(compound)
+        self.selected_attributes = selected_attributes
 
     def parse(self, response):
         """
@@ -29,7 +29,7 @@ class FourmiSpider(Spider):
         :param response: A Scrapy Response object that should be parsed
         :return: A list of Result items and new Request to be handled by the scrapy core.
         """
-        for source in self.__sources:
+        for source in self._sources:
             if re.match(source.website, response.url):
                 log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                 return source.parse(response)
@@ -42,10 +42,12 @@ class FourmiSpider(Spider):
         :return: A list of Scrapy Request objects
         """
         requests = []
-        for parser in self.__sources:
-            parser_requests = parser.new_compound_request(compound)
-            if parser_requests is not None:
-                requests.append(parser_requests)
+        if compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
         return requests
 
     def start_requests(self):
@@ -71,5 +73,5 @@ class FourmiSpider(Spider):
         A function that adds a new Parser object to the list of available parsers.
         :param source: A Source object
         """
-        self.__sources.append(source)
+        self._sources.append(source)
         source.set_spider(self)
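
Note: with synonyms now a set, get_synonym_requests doubles as a seen-list,
so requesting the same compound twice no longer yields duplicate requests. A
short usage sketch mirroring test_synonym_requests further down; the compound
names are made up:

    from FourmiCrawler.spider import FourmiSpider
    from FourmiCrawler.sources.ChemSpider import ChemSpider

    spi = FourmiSpider(compound='water')
    spi.add_source(ChemSpider())
    print(len(spi.get_synonym_requests('oxidane')))  # 1: new synonym, request queued
    print(len(spi.get_synonym_requests('oxidane')))  # 0: already in spi.synonyms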

README.md

@@ -1,5 +1,9 @@
 # Fourmi
 
+**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+
+**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
+
 Fourmi is a web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
 substance. The program will produce all available attributes of the substance

fourmi.py

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+# !/usr/bin/env python
 """
 Fourmi, a web scraper built to search specific information for a given compound (and its pseudonyms).

tests/__init__.py (new file, +1)

@@ -0,0 +1 @@
+
tests/test_pipeline.py (new file, +52)

@@ -0,0 +1,52 @@
+import copy
+import unittest
+
+from scrapy.exceptions import DropItem
+
+from FourmiCrawler import pipelines, spider, items
+
+
+class TestPipelines(unittest.TestCase):
+    def setUp(self):
+        self.testItem = items.Result()
+
+    def test_none_pipeline(self):
+        # Testing the pipeline that replaces the None values in items.
+        self.testItem["value"] = "abc"
+        pipe = pipelines.RemoveNonePipeline()
+        processed = pipe.process_item(self.testItem, spider.FourmiSpider())
+
+        self.assertTrue(processed["value"] == "abc")
+        for key in self.testItem:
+            self.assertIsNotNone(processed[key])
+            if key != "value":
+                self.assertIs(processed[key], "")
+
+    def test_duplicate_pipeline(self):
+        # Testing the pipeline that removes duplicates.
+        self.testItem["attribute"] = "test"
+        self.testItem["value"] = "test"
+        self.testItem["conditions"] = "test"
+        pipe = pipelines.DuplicatePipeline()
+
+        self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
+        self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())
+
+        other_item = copy.deepcopy(self.testItem)
+        other_item["value"] = "test1"
+        self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item)
+
+    def test_attribute_selection(self):
+        # Testing the pipeline that selects attributes.
+        item1 = copy.deepcopy(self.testItem)
+        item2 = copy.deepcopy(self.testItem)
+        item1["attribute"] = "abd"
+        item2["attribute"] = "abc"
+        s = spider.FourmiSpider(selected_attributes=["a.d"])
+        pipe = pipelines.AttributeSelectionPipeline()
+
+        self.assertEqual(pipe.process_item(item1, s), item1)
+        self.assertRaises(DropItem, pipe.process_item, item2, s)

tests/test_sourceloader.py (new file, +33)

@@ -0,0 +1,33 @@
+import unittest
+
+from sourceloader import SourceLoader
+
+
+class TestSourceloader(unittest.TestCase):
+    def setUp(self):
+        self.loader = SourceLoader()
+
+    def test_init(self):
+        # Test if sourceloader points to the right directory, where the sources are present.
+        self.assertIn("Source: Source", str(self.loader))
+        self.assertIn("Source: NIST", str(self.loader))
+        self.assertIn("Source: ChemSpider", str(self.loader))
+        self.assertIn("Source: WikipediaParser", str(self.loader))
+
+    def test_include(self):
+        # Tests for the include functionality.
+        self.loader.include(["So.rc.*"])
+
+        self.assertIn("Source: Source", str(self.loader))
+        self.assertNotIn("Source: NIST", str(self.loader))
+        self.assertNotIn("Source: ChemSpider", str(self.loader))
+        self.assertNotIn("Source: WikipediaParser", str(self.loader))
+
+    def test_exclude(self):
+        # Tests for the exclude functionality.
+        self.loader.exclude(["So.rc.*"])
+
+        self.assertNotIn("Source: Source", str(self.loader))
+        self.assertIn("Source: NIST", str(self.loader))
+        self.assertIn("Source: ChemSpider", str(self.loader))
+        self.assertIn("Source: WikipediaParser", str(self.loader))

tests/test_spider.py (new file, +61)

@@ -0,0 +1,61 @@
+import unittest
+
+from scrapy.http import Request
+
+from FourmiCrawler import spider
+from FourmiCrawler.sources.ChemSpider import ChemSpider
+from FourmiCrawler.sources.source import Source
+
+
+class TestFoumiSpider(unittest.TestCase):
+    def setUp(self):
+        self.compound = "test_compound"
+        self.attributes = ["a.*", ".*a"]
+        self.spi = spider.FourmiSpider(self.compound, self.attributes)
+
+    def test_init(self):
+        # Test the initiation of the Fourmi spider
+        self.assertIn(self.compound, self.spi.synonyms)
+        for attr in self.attributes:
+            self.assertIn(attr, self.spi.selected_attributes)
+
+    def test_add_source(self):
+        # Testing the source adding function of the Fourmi spider
+        src = Source()
+        self.spi.add_source(src)
+        self.assertIn(src, self.spi._sources)
+
+    def test_add_sources(self):
+        # Testing the function that adds multiple sources
+        srcs = [Source(), Source(), Source()]
+        self.spi.add_sources(srcs)
+        for src in srcs:
+            self.assertIn(src, self.spi._sources)
+
+    def test_start_requests(self):
+        # A test for the function that generates the start requests
+        self.spi._sources = []
+        src = Source()
+        self.spi.add_source(src)
+        self.assertEqual(self.spi.start_requests(), [])
+
+        src2 = ChemSpider()
+        self.spi.add_source(src2)
+        self.assertIsNotNone(self.spi.start_requests())
+
+    def test_synonym_requests(self):
+        # A test for the synonym request function
+        self.spi._sources = []
+        src = Source()
+        self.spi.add_source(src)
+        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
+        self.assertIn("new_compound", self.spi.synonyms)
+
+        src2 = ChemSpider()
+        self.spi.add_source(src2)
+        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
+        self.assertIn("other_compound", self.spi.synonyms)
+        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])