Merge branch 'develop' into feature/PubChem

Jip J. Dekker 2014-06-04 19:54:56 +02:00
commit bf1822059f
13 changed files with 215 additions and 39 deletions

.travis.yml (new file, +15)

@@ -0,0 +1,15 @@
# Config file for automatic testing at travis-ci.org
language: python
python: 2.7
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
    - pip install Scrapy docopt
# command to run tests, e.g. python setup.py test
script:
    - nosetests tests
notifications:
    slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
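
The install and script steps above can be reproduced locally for a quick sanity check. A hypothetical helper sketch (not part of the repository; it installs nose explicitly, which the Travis Python image already provides):

    import subprocess

    # Mirror the Travis "install" and "script" steps on a local machine.
    subprocess.check_call(["pip", "install", "Scrapy", "docopt", "nose"])
    subprocess.check_call(["nosetests", "tests"])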

FourmiCrawler/pipelines.py

@@ -4,12 +4,13 @@ import re
from scrapy.exceptions import DropItem


class RemoveNonePipeline(object):
    def __init__(self):
-        self.known_values = set()
+        pass

-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
        """
        Processing the items so None values are replaced by empty strings
        :param item: The incoming item

@@ -21,8 +22,8 @@ class RemoveNonePipeline(object):
                item[key] = ""
        return item


class DuplicatePipeline(object):
    def __init__(self):
        self.known_values = set()

@@ -35,17 +36,18 @@ class DuplicatePipeline(object):
        """
        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
        else:
            self.known_values.add(value)
        return item


class AttributeSelectionPipeline(object):
    def __init__(self):
-        pass;
+        pass

-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
        """
        The items are processed using the selected attribute list available in the spider,
        items that don't match the selected items are dropped.
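
After this change neither RemoveNonePipeline nor AttributeSelectionPipeline keeps per-instance state, which is why process_item becomes a @staticmethod. A minimal, hypothetical sketch of the resulting pipeline shape (the Result stand-in only declares the three fields used elsewhere in this diff, not the project's full item):

    from scrapy.item import Item, Field

    class Result(Item):  # stand-in for FourmiCrawler.items.Result
        attribute = Field()
        value = Field()
        conditions = Field()

    class RemoveNonePipeline(object):
        @staticmethod
        def process_item(item, spider):
            # Fill every unset or None field with an empty string.
            for key in item.fields:
                if item.get(key) is None:
                    item[key] = ""
            return item

    item = RemoveNonePipeline.process_item(Result(value="abc"), spider=None)
    # item["value"] == "abc"; item["attribute"] and item["conditions"] are now ""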

FourmiCrawler/settings.py

@@ -3,7 +3,7 @@
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'FourmiCrawler'

FourmiCrawler/sources/ChemSpider.py

@@ -1,9 +1,12 @@
-from source import Source
+import re
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
+from source import Source
from FourmiCrawler.items import Result
-import re

# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

@@ -58,9 +61,7 @@ class ChemSpider(Source):
            prop_conditions = ''

            # Test for properties without values, with one hardcoded exception
-            if (not re.match(r'^\d', prop_value) or
-                    (prop_name == 'Polarizability' and
-                     prop_value == '10-24cm3')):
+            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                continue

            # Match for condition in parentheses
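
The collapsed guard above decides which scraped property rows are skipped. A tiny, self-contained illustration of its behaviour (the helper function is made up for the sketch and is not part of the scraper):

    import re

    # Hypothetical helper mirroring the guard: rows are skipped when the value
    # does not start with a digit, or for the hardcoded Polarizability case.
    def keep_property(prop_name, prop_value):
        if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
            return False  # the spider does `continue` here
        return True

    assert keep_property('Boiling Point', '78 deg C') is True
    assert keep_property('Appearance', 'colourless liquid') is False
    assert keep_property('Polarizability', '10-24cm3') is False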

FourmiCrawler/sources/NIST.py

@@ -1,13 +1,16 @@
-from source import Source
+import re
from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector
+from source import Source
from FourmiCrawler.items import Result
-import re

# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
# Result item, but should be included eventually.


class NIST(Source):
    """NIST Scraper plugin

@@ -15,7 +18,7 @@ class NIST(Source):
    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    website = "http://webbook.nist.gov/*"
    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

@@ -75,7 +78,7 @@ class NIST(Source):
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
-                continue #Assume unsupported
+                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):

@@ -103,7 +106,7 @@ class NIST(Source):
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')

@@ -129,10 +132,10 @@ class NIST(Source):
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
            if extra_data_url:
                request = Request(url=self.website[:-1] + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = []

@@ -180,7 +183,6 @@ class NIST(Source):
            })
            results.append(result)
        return results

    @staticmethod

@@ -228,7 +230,8 @@ class NIST(Source):
        return results

-    def parse_individual_datapoints(self, response):
+    @staticmethod
+    def parse_individual_datapoints(response):
        """Parses the page linked from aggregate data"""
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
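
The scraper still relies on the '.extract()[0]' pattern that the TODO comments above flag as a possible IndexError. A small stand-alone illustration of the failure mode and a guarded alternative (the markup and variable names are invented for the sketch):

    from scrapy.selector import Selector

    # Made-up NIST-like snippet, only to show the extract() behaviour.
    sel = Selector(text='<ul><li><strong>CAS Registry Number:</strong> 64-17-5</li></ul>')
    raw_cas_number = sel.xpath('//li[strong="CAS Registry Number:"]/text()')

    # raw_cas_number.extract()[0] raises IndexError when the element is missing;
    # checking the extracted list first avoids that.
    extracted = raw_cas_number.extract()
    cas_number = extracted[0].strip() if extracted else None
    print(cas_number)  # -> 64-17-5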

FourmiCrawler/sources/WikipediaParser.py

@@ -1,9 +1,11 @@
+import re
from scrapy.http import Request
from scrapy import log
-from source import Source
from scrapy.selector import Selector
+from source import Source
from FourmiCrawler.items import Result
-import re


class WikipediaParser(Source):

@@ -36,7 +38,7 @@ class WikipediaParser(Source):
        """ scrape data from infobox on wikipedia. """
        items = []

-        #be sure to get chembox (wikipedia template)
+        # be sure to get chembox (wikipedia template)
        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
        prop_names = tr_list[::2]

FourmiCrawler/spider.py

@@ -9,8 +9,8 @@ class FourmiSpider(Spider):
    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"
-    __sources = []
-    synonyms = []
+    _sources = []
+    synonyms = set()

    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """

@@ -19,8 +19,8 @@ class FourmiSpider(Spider):
        :param selected_attributes: A list of regular expressions that the attributes should match.
        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms.append(compound)
-        self.selected_attributes = selected_attributes;
+        self.synonyms.add(compound)
+        self.selected_attributes = selected_attributes

    def parse(self, response):
        """

@@ -29,7 +29,7 @@ class FourmiSpider(Spider):
        :param response: A Scrapy Response object that should be parsed
        :return: A list of Result items and new Request to be handled by the scrapy core.
        """
-        for source in self.__sources:
+        for source in self._sources:
            if re.match(source.website, response.url):
                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                return source.parse(response)

@@ -42,10 +42,12 @@ class FourmiSpider(Spider):
        :return: A list of Scrapy Request objects
        """
        requests = []
-        for parser in self.__sources:
-            parser_requests = parser.new_compound_request(compound)
-            if parser_requests is not None:
-                requests.append(parser_requests)
+        if compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
        return requests

    def start_requests(self):

@@ -71,5 +73,5 @@ class FourmiSpider(Spider):
        A function add a new Parser object to the list of available parsers.
        :param source: A Source Object
        """
-        self.__sources.append(source)
+        self._sources.append(source)
        source.set_spider(self)
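
The net effect of the synonyms hunk is that get_synonym_requests only fans out to the sources for synonyms it has not seen before. A self-contained sketch of that set-based deduplication pattern with stand-in classes (none of these names are the project's real API):

    # Stand-in classes, only to demonstrate the deduplication introduced above.
    class FakeSource(object):
        def new_compound_request(self, compound):
            return "request-for-" + compound  # stands in for a scrapy Request

    class MiniSpider(object):
        def __init__(self):
            self.synonyms = set()
            self._sources = [FakeSource()]

        def get_synonym_requests(self, compound):
            requests = []
            if compound not in self.synonyms:  # only fan out for unseen synonyms
                self.synonyms.add(compound)
                for parser in self._sources:
                    parser_requests = parser.new_compound_request(compound)
                    if parser_requests is not None:
                        requests.append(parser_requests)
            return requests

    spider = MiniSpider()
    assert spider.get_synonym_requests("aspirin") == ["request-for-aspirin"]
    assert spider.get_synonym_requests("aspirin") == []  # duplicate synonym: no new requests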

README.md

@@ -1,5 +1,9 @@
# Fourmi

+**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)

Fourmi is an web scraper for chemical substances. The program is designed to be
used as a search engine to search multiple chemical databases for a specific
substance. The program will produce all available attributes of the substance

fourmi.py

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+# !/usr/bin/env python
"""
Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).

@@ -102,7 +102,7 @@ def search(docopt_arguments, source_loader):
# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
    loader = SourceLoader()
    if arguments["--include"]:

tests/__init__.py (new file, +1)

@@ -0,0 +1 @@

tests/test_pipeline.py (new file, +52)

@@ -0,0 +1,52 @@
import copy
import unittest

from scrapy.exceptions import DropItem

from FourmiCrawler import pipelines, spider, items


class TestPipelines(unittest.TestCase):
    def setUp(self):
        self.testItem = items.Result()

    def test_none_pipeline(self):
        # Testing the pipeline that replaces the None values in items.
        self.testItem["value"] = "abc"
        pipe = pipelines.RemoveNonePipeline()
        processed = pipe.process_item(self.testItem, spider.FourmiSpider())
        self.assertTrue(processed["value"] == "abc")
        for key in self.testItem:
            self.assertIsNotNone(processed[key])
            if key is not "value":
                self.assertIs(processed[key], "")

    def test_duplicate_pipeline(self):
        # Testing the pipeline that removes duplicates.
        self.testItem["attribute"] = "test"
        self.testItem["value"] = "test"
        self.testItem["conditions"] = "test"
        pipe = pipelines.DuplicatePipeline()
        self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
        self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())
        other_item = copy.deepcopy(self.testItem)
        other_item["value"] = "test1"
        self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item)

    def test_attribute_selection(self):
        # Testing the pipeline that selects attributes.
        item1 = copy.deepcopy(self.testItem)
        item2 = copy.deepcopy(self.testItem)
        item1["attribute"] = "abd"
        item2["attribute"] = "abc"
        s = spider.FourmiSpider(selected_attributes=["a.d"])
        pipe = pipelines.AttributeSelectionPipeline()
        self.assertEqual(pipe.process_item(item1, s), item1)
        self.assertRaises(DropItem, pipe.process_item, item2, s)

tests/test_sourceloader.py (new file, +33)

@@ -0,0 +1,33 @@
import unittest

from sourceloader import SourceLoader


class TestSourceloader(unittest.TestCase):
    def setUp(self):
        self.loader = SourceLoader()

    def test_init(self):
        # Test if sourceloader points to the right directory, where the sources are present.
        self.assertIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))

    def test_include(self):
        # Tests for the include functionality.
        self.loader.include(["So.rc.*"])
        self.assertIn("Source: Source", str(self.loader))
        self.assertNotIn("Source: NIST", str(self.loader))
        self.assertNotIn("Source: ChemSpider", str(self.loader))
        self.assertNotIn("Source: WikipediaParser", str(self.loader))

    def test_exclude(self):
        # Tests for the exclude functionality.
        self.loader.exclude(["So.rc.*"])
        self.assertNotIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))

tests/test_spider.py (new file, +61)

@@ -0,0 +1,61 @@
import unittest

from scrapy.http import Request

from FourmiCrawler import spider
from FourmiCrawler.sources.ChemSpider import ChemSpider
from FourmiCrawler.sources.source import Source


class TestFoumiSpider(unittest.TestCase):
    def setUp(self):
        self.compound = "test_compound"
        self.attributes = ["a.*", ".*a"]
        self.spi = spider.FourmiSpider(self.compound, self.attributes)

    def test_init(self):
        # Test the initiation of the Fourmi spider
        self.assertIn(self.compound, self.spi.synonyms)
        for attr in self.attributes:
            self.assertIn(attr, self.spi.selected_attributes)

    def test_add_source(self):
        # Testing the source adding function of the Fourmi spider
        src = Source()
        self.spi.add_source(src)
        self.assertIn(src, self.spi._sources)

    def test_add_sources(self):
        # Testing the function that adds multiple sources
        srcs = [Source(), Source(), Source()]
        self.spi.add_sources(srcs)
        for src in srcs:
            self.assertIn(src, self.spi._sources)

    def test_start_requests(self):
        # A test for the function that generates the start requests
        self.spi._sources = []
        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])
        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsNotNone(self.spi.start_requests())

    def test_synonym_requests(self):
        # A test for the synonym request function
        self.spi._sources = []
        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)
        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])