
Merge branch 'release/0.4.1'

Jip J. Dekker 2014-06-04 19:53:11 +02:00
commit 7fdec4bba7
16 changed files with 323 additions and 62 deletions

.travis.yml (new file)

@@ -0,0 +1,15 @@
# Config file for automatic testing at travis-ci.org
language: python
python: 2.7
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
- pip install Scrapy docopt
# command to run tests, e.g. python setup.py test
script:
- nosetests tests
notifications:
slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM

@@ -1,6 +1,4 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
+# For more information on item definitions, see the Scrapy documentation in:
 # http://doc.scrapy.org/en/latest/topics/items.html
 from scrapy.item import Item, Field

@@ -1,16 +1,16 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+# For more information on item pipelines, see the Scrapy documentation in:
+# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 import re
 from scrapy.exceptions import DropItem


 class RemoveNonePipeline(object):

     def __init__(self):
-        self.known_values = set()
+        pass

-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         Processing the items so None values are replaced by empty strings
         :param item: The incoming item
@@ -22,8 +22,8 @@ class RemoveNonePipeline(object):
                 item[key] = ""
         return item


 class DuplicatePipeline(object):

     def __init__(self):
         self.known_values = set()
@@ -36,17 +36,18 @@ class DuplicatePipeline(object):
         """
         value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item


 class AttributeSelectionPipeline(object):

     def __init__(self):
-        pass;
+        pass

-    def process_item(self, item, spider):
+    @staticmethod
+    def process_item(item, spider):
         """
         The items are processed using the selected attribute list available in the spider,
         items that don't match the selected items are dropped.
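
The deleted boilerplate comment pointed at Scrapy's ITEM_PIPELINES setting, which is presumably where these three pipelines stay registered. A minimal sketch of how an item passes through the first two of them, mirroring the new tests/test_pipeline.py below (the field values are invented):

    # A standalone sketch, not part of this commit; the item values are made up.
    from FourmiCrawler import pipelines, items

    item = items.Result()
    item["attribute"] = "Melting point"   # hypothetical attribute
    item["value"] = "0 degC"              # hypothetical value
    item["conditions"] = ""

    remover = pipelines.RemoveNonePipeline()
    deduper = pipelines.DuplicatePipeline()

    item = remover.process_item(item, spider=None)  # None values are replaced by empty strings
    item = deduper.process_item(item, spider=None)  # a second identical item would raise DropItem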

@@ -3,7 +3,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
 #     http://doc.scrapy.org/en/latest/topics/settings.html
 #

 BOT_NAME = 'FourmiCrawler'

@@ -1,9 +1,12 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re

 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
@@ -58,9 +61,7 @@ class ChemSpider(Source):
             prop_conditions = ''

             # Test for properties without values, with one hardcoded exception
-            if (not re.match(r'^\d', prop_value) or
-                    (prop_name == 'Polarizability' and
-                     prop_value == '10-24cm3')):
+            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                 continue

             # Match for condition in parentheses
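
The collapsed condition decides which scraped properties are skipped: values without a leading digit, plus the hardcoded Polarizability case. A standalone illustration of that predicate (the should_skip helper and the example values are invented):

    # Not taken from the source; it only restates the skip rule rewritten above.
    import re


    def should_skip(prop_name, prop_value):
        # Skip values with no leading digit, plus the '10-24cm3' Polarizability entry,
        # which starts with a digit but appears to be a unit rather than a value.
        return not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3')


    assert should_skip('Appearance', 'colourless liquid')
    assert should_skip('Polarizability', '10-24cm3')
    assert not should_skip('Boiling Point', '100 deg C')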

@@ -1,13 +1,16 @@
-from source import Source
+import re
+
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
+
+from source import Source
 from FourmiCrawler.items import Result
-import re

 # [TODO]: values can be '128.', perhaps remove the dot in that case?
 # [TODO]: properties have references and comments which do not exist in the
 # Result item, but should be included eventually.


 class NIST(Source):
     """NIST Scraper plugin
@@ -75,7 +78,7 @@ class NIST(Source):
                 requests.extend(self.parse_generic_data(table, summary))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
-                continue #Assume unsupported
+                continue  # Assume unsupported
         return requests

     def parse_generic_info(self, sel):
@@ -103,7 +106,7 @@ class NIST(Source):
         data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
         raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                 '/tt/text()')
         data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]
         raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
@@ -129,10 +132,10 @@ class NIST(Source):
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
                 request = Request(url=self.website[:-1] + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
             data = []
@@ -180,7 +183,6 @@ class NIST(Source):
             })
             results.append(result)
         return results

     @staticmethod
@@ -228,7 +230,8 @@ class NIST(Source):
         return results

-    def parse_individual_datapoints(self, response):
+    @staticmethod
+    def parse_individual_datapoints(response):
         """Parses the page linked from aggregate data"""
         sel = Selector(response)
         table = sel.xpath('//table[@class="data"]')[0]

@@ -1,9 +1,11 @@
+import re
+
 from scrapy.http import Request
 from scrapy import log
-from source import Source
 from scrapy.selector import Selector
+from source import Source
 from FourmiCrawler.items import Result
-import re


 class WikipediaParser(Source):
@@ -36,7 +38,7 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []

-        #be sure to get chembox (wikipedia template)
+        # be sure to get chembox (wikipedia template)
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]

@@ -7,15 +7,32 @@ class Source:
     _spider = None

     def __init__(self):
+        """
+        Initiation of a new Source
+        """
         pass

-    def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+    def parse(self, response):
+        """
+        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
+        :param response: A Scrapy Response object
+        :return: A list of Result items and new Scrapy Requests
+        """
+        log.msg("The parse function of the empty source was used.", level=log.WARNING)
         pass

     def new_compound_request(self, compound):
+        """
+        This function should return a Scrapy Request for the given compound request.
+        :param compound: A compound name.
+        :return: A new Scrapy Request
+        """
         # return Request(url=self.website[:-1] + compound, callback=self.parse)
         pass

     def set_spider(self, spider):
+        """
+        A Function to save the associated spider.
+        :param spider: A FourmiSpider object
+        """
         self._spider = spider
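
The expanded docstrings describe the interface every source has to fill in. A minimal sketch of a concrete source, not taken from the repository (the class name, website regex and URL are invented; ChemSpider, NIST and WikipediaParser are the real implementations):

    # A sketch only; a real source returns Result items and follow-up Requests from parse().
    from scrapy.http import Request

    from source import Source


    class ExampleSource(Source):
        website = r"http://example\.org/.*"   # hypothetical regex the spider matches URLs against

        def parse(self, response):
            # Scrape the response here and return Result items and/or new Requests.
            return []

        def new_compound_request(self, compound):
            # Build the first Request for a compound; the URL scheme is made up.
            return Request(url="http://example.org/" + compound, callback=self.parse)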

@@ -1,43 +1,77 @@
+import re
+
 from scrapy.spider import Spider
 from scrapy import log
-import re


 class FourmiSpider(Spider):
+    """
+    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
+    """
     name = "FourmiSpider"
-    __parsers = []
-    synonyms = []
+    _sources = []
+    synonyms = set()

     def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
+        """
+        Initiation of the Spider
+        :param compound: compound that will be searched.
+        :param selected_attributes: A list of regular expressions that the attributes should match.
+        """
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms.append(compound)
-        self.selected_attributes = selected_attributes;
+        self.synonyms.add(compound)
+        self.selected_attributes = selected_attributes

-    def parse(self, reponse):
-        for parser in self.__parsers:
-            if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
-                return parser.parse(reponse)
+    def parse(self, response):
+        """
+        The function that is called when a response to a request is available. This function distributes this to a
+        source which should be able to handle parsing the data.
+        :param response: A Scrapy Response object that should be parsed
+        :return: A list of Result items and new Request to be handled by the scrapy core.
+        """
+        for source in self._sources:
+            if re.match(source.website, response.url):
+                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
+                return source.parse(response)
         return None

     def get_synonym_requests(self, compound):
+        """
+        A function that generates new Scrapy Request for each source given a new synonym of a compound.
+        :param compound: A compound name
+        :return: A list of Scrapy Request objects
+        """
         requests = []
-        for parser in self.__parsers:
-            parser_requests = parser.new_compound_request(compound)
-            if parser_requests is not None:
-                requests.append(parser_requests)
+        if compound not in self.synonyms:
+            self.synonyms.add(compound)
+            for parser in self._sources:
+                parser_requests = parser.new_compound_request(compound)
+                if parser_requests is not None:
+                    requests.append(parser_requests)
         return requests

     def start_requests(self):
+        """
+        The function called by Scrapy for it's first Requests
+        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
+        """
         requests = []
         for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
         return requests

-    def add_parsers(self, parsers):
-        for parser in parsers:
-            self.add_parser(parser)
+    def add_sources(self, sources):
+        """
+        A function to add a new Parser objects to the list of available sources.
+        :param sources: A list of Source Objects.
+        """
+        for parser in sources:
+            self.add_source(parser)

-    def add_parser(self, parser):
-        self.__parsers.append(parser)
-        parser.set_spider(self)
+    def add_source(self, source):
+        """
+        A function add a new Parser object to the list of available parsers.
+        :param source: A Source Object
+        """
+        self._sources.append(source)
+        source.set_spider(self)
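
The renamed methods and the new synonyms set change the request bookkeeping: a synonym now only generates requests the first time it is seen. A small sketch of that behaviour, mirroring the checks in tests/test_spider.py below:

    # A sketch, not part of the commit; the compound names are only examples.
    from FourmiCrawler import spider
    from FourmiCrawler.sources.ChemSpider import ChemSpider

    spi = spider.FourmiSpider(compound="test_compound", selected_attributes=[".*"])
    spi.add_source(ChemSpider())

    first = spi.get_synonym_requests("other_compound")    # one Request per attached source
    second = spi.get_synonym_requests("other_compound")   # synonym already known
    assert len(first) == 1 and second == []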

@@ -1,5 +1,9 @@
 # Fourmi

+**Master branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=master)](https://travis-ci.org/Recondor/Fourmi)
+
+**Developing branch**: [![Build Status](https://travis-ci.org/Recondor/Fourmi.svg?branch=develop)](https://travis-ci.org/Recondor/Fourmi)
+
 Fourmi is an web scraper for chemical substances. The program is designed to be
 used as a search engine to search multiple chemical databases for a specific
 substance. The program will produce all available attributes of the substance

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+# !/usr/bin/env python
 """
 Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
@@ -33,9 +33,16 @@ from FourmiCrawler.spider import FourmiSpider
 from sourceloader import SourceLoader


-def setup_crawler(searchable, settings, source_loader, attributes):
-    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
-    spider.add_parsers(source_loader.sources)
+def setup_crawler(compound, settings, source_loader, attributes):
+    """
+    This function prepares and start the crawler which starts the actual search on the internet
+    :param compound: The compound which should be searched
+    :param settings: A scrapy settings object
+    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
+    :param attributes: A list of regular expressions which the attribute names should match.
+    """
+    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
+    spider.add_sources(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
@@ -44,8 +51,13 @@ def setup_crawler(searchable, settings, source_loader, attributes):

 def scrapy_settings_manipulation(docopt_arguments):
+    """
+    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
+    project these are command line arguments.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
     settings = get_project_settings()
+    # [todo] - add at least a warning for files that already exist
     if docopt_arguments["--output"] != 'result.*format*':
         settings.overrides["FEED_URI"] = docopt_arguments["--output"]
     elif docopt_arguments["--format"] == "jsonlines":
@@ -60,6 +72,10 @@ def scrapy_settings_manipulation(docopt_arguments):

 def start_log(docopt_arguments):
+    """
+    This function starts the logging functionality of Scrapy using the settings given by the CLI.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    """
     if docopt_arguments["--log"] is not None:
         if docopt_arguments["--verbose"]:
             log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
@@ -73,14 +89,20 @@ def start_log(docopt_arguments):

 def search(docopt_arguments, source_loader):
+    """
+    The function that facilitates the search for a specific compound.
+    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
+    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
+    """
     start_log(docopt_arguments)
     settings = scrapy_settings_manipulation(docopt_arguments)
     setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
     reactor.run()


+# The start for the Fourmi Command Line interface.
 if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
     loader = SourceLoader()
     if arguments["--include"]:
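
For orientation, a minimal sketch of the objects setup_crawler now wires together, using the renamed add_sources call; the compound name is invented and the Crawler/reactor start-up is left out:

    # A sketch, not part of the commit; "methanol" is only an example compound name.
    from FourmiCrawler.spider import FourmiSpider
    from sourceloader import SourceLoader

    spider = FourmiSpider(compound="methanol", selected_attributes=[".*"])
    spider.add_sources(SourceLoader().sources)
    # setup_crawler() then wraps the spider in a scrapy Crawler with the manipulated
    # settings and starts the Twisted reactor; that part is omitted here.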

@@ -1,6 +1,7 @@
 import inspect
 import os
 import re
+
 from FourmiCrawler.sources.source import Source
@@ -8,6 +9,10 @@ class SourceLoader:
     sources = []

     def __init__(self, rel_dir="FourmiCrawler/sources"):
+        """
+        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
+        :param rel_dir: A relative path to a directory.
+        """
         path = os.path.dirname(os.path.abspath(__file__))
         path += "/" + rel_dir
         known_parser = set()
@@ -21,18 +26,30 @@ class SourceLoader:
                 known_parser.add(cls)

     def include(self, source_names):
+        """
+        This function excludes all sources that don't match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         new = set()
         for name in source_names:
             new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = list(new)

     def exclude(self, source_names):
+        """
+        This function excludes all sources that match the given regular expressions.
+        :param source_names: A list of regular expression (strings)
+        """
         exclude = []
         for name in source_names:
             exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
         self.sources = [src for src in self.sources if src not in exclude]

     def __str__(self):
+        """
+        This function returns a string with all sources currently available in the SourceLoader.
+        :return: a string with all available sources.
+        """
         string = ""
         for src in self.sources:
             string += "Source: " + src.__class__.__name__

tests/__init__.py (new file)

@@ -0,0 +1 @@

tests/test_pipeline.py (new file)

@@ -0,0 +1,52 @@
import copy
import unittest

from scrapy.exceptions import DropItem

from FourmiCrawler import pipelines, spider, items


class TestPipelines(unittest.TestCase):
    def setUp(self):
        self.testItem = items.Result()

    def test_none_pipeline(self):
        # Testing the pipeline that replaces the None values in items.
        self.testItem["value"] = "abc"
        pipe = pipelines.RemoveNonePipeline()
        processed = pipe.process_item(self.testItem, spider.FourmiSpider())

        self.assertTrue(processed["value"] == "abc")
        for key in self.testItem:
            self.assertIsNotNone(processed[key])
            if key is not "value":
                self.assertIs(processed[key], "")

    def test_duplicate_pipeline(self):
        # Testing the pipeline that removes duplicates.
        self.testItem["attribute"] = "test"
        self.testItem["value"] = "test"
        self.testItem["conditions"] = "test"

        pipe = pipelines.DuplicatePipeline()
        self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
        self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())

        other_item = copy.deepcopy(self.testItem)
        other_item["value"] = "test1"
        self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item)

    def test_attribute_selection(self):
        # Testing the pipeline that selects attributes.
        item1 = copy.deepcopy(self.testItem)
        item2 = copy.deepcopy(self.testItem)
        item1["attribute"] = "abd"
        item2["attribute"] = "abc"
        s = spider.FourmiSpider(selected_attributes=["a.d"])
        pipe = pipelines.AttributeSelectionPipeline()
        self.assertEqual(pipe.process_item(item1, s), item1)
        self.assertRaises(DropItem, pipe.process_item, item2, s)

@@ -0,0 +1,33 @@
import unittest

from sourceloader import SourceLoader


class TestSourceloader(unittest.TestCase):
    def setUp(self):
        self.loader = SourceLoader()

    def test_init(self):
        # Test if sourceloader points to the right directory, where the sources are present.
        self.assertIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))

    def test_include(self):
        # Tests for the include functionality.
        self.loader.include(["So.rc.*"])

        self.assertIn("Source: Source", str(self.loader))
        self.assertNotIn("Source: NIST", str(self.loader))
        self.assertNotIn("Source: ChemSpider", str(self.loader))
        self.assertNotIn("Source: WikipediaParser", str(self.loader))

    def test_exclude(self):
        # Tests for the exclude functionality.
        self.loader.exclude(["So.rc.*"])

        self.assertNotIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))

tests/test_spider.py (new file)

@@ -0,0 +1,61 @@
import unittest

from scrapy.http import Request

from FourmiCrawler import spider
from FourmiCrawler.sources.ChemSpider import ChemSpider
from FourmiCrawler.sources.source import Source


class TestFoumiSpider(unittest.TestCase):
    def setUp(self):
        self.compound = "test_compound"
        self.attributes = ["a.*", ".*a"]
        self.spi = spider.FourmiSpider(self.compound, self.attributes)

    def test_init(self):
        # Test the initiation of the Fourmi spider
        self.assertIn(self.compound, self.spi.synonyms)
        for attr in self.attributes:
            self.assertIn(attr, self.spi.selected_attributes)

    def test_add_source(self):
        # Testing the source adding function of the Fourmi spider
        src = Source()
        self.spi.add_source(src)
        self.assertIn(src, self.spi._sources)

    def test_add_sources(self):
        # Testing the function that adds multiple sources
        srcs = [Source(), Source(), Source()]
        self.spi.add_sources(srcs)

        for src in srcs:
            self.assertIn(src, self.spi._sources)

    def test_start_requests(self):
        # A test for the function that generates the start requests
        self.spi._sources = []

        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])

        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsNotNone(self.spi.start_requests())

    def test_synonym_requests(self):
        # A test for the synonym request function
        self.spi._sources = []

        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)

        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])