Merge branch 'feature/GUI' of https://github.com/Recondor/Fourmi into feature/GUI

commit bd12216ad4

.travis.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
# Config file for automatic testing at travis-ci.org

language: python
python: 2.7

# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
install:
  - pip install Scrapy docopt

# command to run tests, e.g. python setup.py test
script:
  - nosetests tests

notifications:
  slack: descartes2:6sgCzx3PvrO9IIMwKxj12dDM
@@ -1,6 +1,4 @@
# Define here the models for your scraped items
#
# See documentation in:
# For more information on item definitions, see the Scrapy documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field
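The Result item class itself is not touched by this commit, so its definition does not appear in the diff. For orientation only, a minimal sketch of what it plausibly looks like, inferred from the fields the pipelines and sources access in this diff (attribute, value, conditions, plus a source field hinted at in a TODO); any other fields are omitted rather than guessed.

from scrapy.item import Item, Field


class Result(Item):
    # Field names inferred from item['attribute'], item['value'] and
    # item['conditions'] in pipelines.py; 'source' is suggested by the
    # "[todo] append sources of first item" comment.
    attribute = Field()
    value = Field()
    conditions = Field()
    source = Field()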
@@ -1,16 +1,16 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# For more information on item pipelines, see the Scrapy documentation in:
# http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re

from scrapy.exceptions import DropItem


class RemoveNonePipeline(object):

    def __init__(self):
        self.known_values = set()
        pass

    def process_item(self, item, spider):
    @staticmethod
    def process_item(item, spider):
        """
        Processing the items so None values are replaced by empty strings
        :param item: The incoming item
@@ -22,8 +22,8 @@ class RemoveNonePipeline(object):
                item[key] = ""
        return item


class DuplicatePipeline(object):

class DuplicatePipeline(object):
    def __init__(self):
        self.known_values = set()

@@ -36,17 +36,18 @@ class DuplicatePipeline(object):
        """
        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)  # #[todo] append sources of first item.
            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
        else:
            self.known_values.add(value)
        return item


class AttributeSelectionPipeline(object):

    def __init__(self):
        pass;
        pass

    def process_item(self, item, spider):
    @staticmethod
    def process_item(item, spider):
        """
        The items are processed using the selected attribute list available in the spider,
        items that don't match the selected items are dropped.
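The removed boilerplate comment above points at the ITEM_PIPELINES setting; for orientation, a sketch of how these three pipelines could be registered in FourmiCrawler/settings.py. The dict-with-priority form and the particular numbers are assumptions (Scrapy releases of this era also accepted a plain list of class paths).

# Hypothetical registration in FourmiCrawler/settings.py; priorities are arbitrary.
ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.RemoveNonePipeline': 100,
    'FourmiCrawler.pipelines.DuplicatePipeline': 200,
    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 300,
}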
@@ -3,7 +3,7 @@
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'FourmiCrawler'
@@ -1,9 +1,12 @@
from source import Source
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result
import re


# [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.

@@ -58,9 +61,7 @@ class ChemSpider(Source):
            prop_conditions = ''

            # Test for properties without values, with one hardcoded exception
            if (not re.match(r'^\d', prop_value) or
                    (prop_name == 'Polarizability' and
                     prop_value == '10-24cm3')):
            if not re.match(r'^\d', prop_value) or (prop_name == 'Polarizability' and prop_value == '10-24cm3'):
                continue

            # Match for condition in parentheses
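To make the collapsed condition above easier to read: it skips any property whose value does not start with a digit, and additionally skips the Polarizability unit string even though it does start with one. A small standalone sketch with made-up property values:

import re

samples = [('Boiling Point', '78.4 C'),
           ('Appearance', 'colourless liquid'),
           ('Polarizability', '10-24cm3')]

for prop_name, prop_value in samples:
    skipped = (not re.match(r'^\d', prop_value) or
               (prop_name == 'Polarizability' and prop_value == '10-24cm3'))
    print('%s: %s' % (prop_name, 'skipped' if skipped else 'kept'))
# -> Boiling Point: kept, Appearance: skipped, Polarizability: skipped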
@@ -1,13 +1,16 @@
from source import Source
import re

from scrapy import log
from scrapy.http import Request
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result
import re


# [TODO]: values can be '128.', perhaps remove the dot in that case?
# [TODO]: properties have references and comments which do not exist in the
# Result item, but should be included eventually.
# Result item, but should be included eventually.

class NIST(Source):
    """NIST Scraper plugin
@@ -15,7 +18,7 @@ class NIST(Source):
    This plugin manages searching for a chemical on the NIST website
    and parsing the resulting page if the chemical exists on NIST.
    """
    website = "http://webbook.nist.gov/*"
    website = "http://webbook.nist.gov/*"

    search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'

@@ -75,7 +78,7 @@ class NIST(Source):
                requests.extend(self.parse_generic_data(table, summary))
            else:
                log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                continue #Assume unsupported
                continue  # Assume unsupported
        return requests

    def parse_generic_info(self, sel):
@@ -103,7 +106,7 @@ class NIST(Source):
        data['IUPAC Standard InChI'] = raw_inchi.extract()[0]

        raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                '/tt/text()')
                                '/tt/text()')
        data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

        raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
@@ -129,10 +132,10 @@ class NIST(Source):
        results = []
        for tr in table.xpath('tr[td]'):
            extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                      '/a/@href').extract()
                                      '/a/@href').extract()
            if extra_data_url:
                request = Request(url=self.website[:-1] + extra_data_url[0],
                                  callback=self.parse_individual_datapoints)
                                  callback=self.parse_individual_datapoints)
                results.append(request)
                continue
            data = []
@@ -180,7 +183,6 @@ class NIST(Source):
            })
            results.append(result)


        return results

    @staticmethod
@@ -228,7 +230,8 @@ class NIST(Source):

        return results

    def parse_individual_datapoints(self, response):
    @staticmethod
    def parse_individual_datapoints(response):
        """Parses the page linked from aggregate data"""
        sel = Selector(response)
        table = sel.xpath('//table[@class="data"]')[0]
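A side note on the self.website[:-1] + extra_data_url[0] pattern above: the trailing asterisk in the website attribute lets the same string serve both as the pattern the spider matches response URLs against and, with the wildcard stripped, as a URL prefix for relative links scraped from the page. A minimal illustration; the relative path is a made-up example:

website = "http://webbook.nist.gov/*"
relative_link = "cgi/cbook.cgi?ID=C64175&Units=SI"   # hypothetical href taken from a results page
absolute_url = website[:-1] + relative_link
# absolute_url == "http://webbook.nist.gov/cgi/cbook.cgi?ID=C64175&Units=SI"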
@@ -1,9 +1,11 @@
import re

from scrapy.http import Request
from scrapy import log
from source import Source
from scrapy.selector import Selector

from source import Source
from FourmiCrawler.items import Result
import re


class WikipediaParser(Source):
@@ -36,7 +38,7 @@ class WikipediaParser(Source):
        """ scrape data from infobox on wikipedia. """
        items = []

        #be sure to get chembox (wikipedia template)
        # be sure to get chembox (wikipedia template)
        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
            xpath('normalize-space(string())')
        prop_names = tr_list[::2]
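The tr_list[::2] slice above relies on the chembox cells alternating between property names and property values. A tiny sketch with made-up data; the complementary [1::2] slice for the values is an assumption, since that line falls outside this hunk:

tr_list = ['Molar mass', '46.07 g/mol', 'Density', '0.789 g/cm3']
prop_names = tr_list[::2]    # -> ['Molar mass', 'Density']
prop_values = tr_list[1::2]  # -> ['46.07 g/mol', '0.789 g/cm3']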
@@ -7,15 +7,32 @@ class Source:
    _spider = None

    def __init__(self):
        """
        Initiation of a new Source
        """
        pass

    def parse(self, reponse):
        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
    def parse(self, response):
        """
        This function should be able to parse all Scrapy Response objects with a URL matching the website Regex.
        :param response: A Scrapy Response object
        :return: A list of Result items and new Scrapy Requests
        """
        log.msg("The parse function of the empty source was used.", level=log.WARNING)
        pass

    def new_compound_request(self, compound):
        """
        This function should return a Scrapy Request for the given compound request.
        :param compound: A compound name.
        :return: A new Scrapy Request
        """
        # return Request(url=self.website[:-1] + compound, callback=self.parse)
        pass

    def set_spider(self, spider):
        """
        A Function to save the associated spider.
        :param spider: A FourmiSpider object
        """
        self._spider = spider
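Taken together, the docstrings added above describe the contract a concrete source has to fulfil. A minimal sketch of a subclass under those assumptions; ExampleSource and its URL are illustrative and not part of the repository, and new_compound_request simply mirrors the hint left commented out in the base class:

from scrapy.http import Request

from FourmiCrawler.sources.source import Source


class ExampleSource(Source):
    website = "http://example.org/*"   # pattern the spider matches response URLs against

    def parse(self, response):
        # Would return a list of Result items and/or follow-up Requests.
        return []

    def new_compound_request(self, compound):
        return Request(url=self.website[:-1] + compound, callback=self.parse)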
@@ -1,43 +1,77 @@
import re

from scrapy.spider import Spider
from scrapy import log
import re


class FourmiSpider(Spider):
    """
    A spider writen for the Fourmi Project which calls upon all available sources to request and scrape data.
    """
    name = "FourmiSpider"
    __parsers = []
    synonyms = []
    _sources = []
    synonyms = set()

    def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs):
        """
        Initiation of the Spider
        :param compound: compound that will be searched.
        :param selected_attributes: A list of regular expressions that the attributes should match.
        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.synonyms.append(compound)
        self.selected_attributes = selected_attributes;
        self.synonyms.add(compound)
        self.selected_attributes = selected_attributes

    def parse(self, reponse):
        for parser in self.__parsers:
            if re.match(parser.website, reponse.url):
                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
                return parser.parse(reponse)
    def parse(self, response):
        """
        The function that is called when a response to a request is available. This function distributes this to a
        source which should be able to handle parsing the data.
        :param response: A Scrapy Response object that should be parsed
        :return: A list of Result items and new Request to be handled by the scrapy core.
        """
        for source in self._sources:
            if re.match(source.website, response.url):
                log.msg("Url: " + response.url + " -> Source: " + source.website, level=log.DEBUG)
                return source.parse(response)
        return None

    def get_synonym_requests(self, compound):
        """
        A function that generates new Scrapy Request for each source given a new synonym of a compound.
        :param compound: A compound name
        :return: A list of Scrapy Request objects
        """
        requests = []
        for parser in self.__parsers:
            parser_requests = parser.new_compound_request(compound)
            if parser_requests is not None:
                requests.append(parser_requests)
        if compound not in self.synonyms:
            self.synonyms.add(compound)
            for parser in self._sources:
                parser_requests = parser.new_compound_request(compound)
                if parser_requests is not None:
                    requests.append(parser_requests)
        return requests

    def start_requests(self):
        """
        The function called by Scrapy for it's first Requests
        :return: A list of Scrapy Request generated from the known synonyms using the available sources.
        """
        requests = []
        for synonym in self.synonyms:
            requests.extend(self.get_synonym_requests(synonym))
        return requests

    def add_parsers(self, parsers):
        for parser in parsers:
            self.add_parser(parser)
    def add_sources(self, sources):
        """
        A function to add a new Parser objects to the list of available sources.
        :param sources: A list of Source Objects.
        """
        for parser in sources:
            self.add_source(parser)

    def add_parser(self, parser):
        self.__parsers.append(parser)
        parser.set_spider(self)
    def add_source(self, source):
        """
        A function add a new Parser object to the list of available parsers.
        :param source: A Source Object
        """
        self._sources.append(source)
        source.set_spider(self)
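A rough usage sketch of the renamed API, based only on what is visible in this diff and in the tests further down; the compound name is an arbitrary example. The spider hands each response to the first source whose website pattern matches the response URL, so a source must be registered before its pages can be parsed:

from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.sources.ChemSpider import ChemSpider

spider = FourmiSpider(compound="ethanol", selected_attributes=[".*"])
spider.add_source(ChemSpider())       # registers the source and links it back to the spider
requests = spider.start_requests()    # one Request per known synonym per source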
@@ -1,5 +1,9 @@
# Fourmi

**Master branch**: [](https://travis-ci.org/Recondor/Fourmi)

**Developing branch**: [](https://travis-ci.org/Recondor/Fourmi)

Fourmi is an web scraper for chemical substances. The program is designed to be
used as a search engine to search multiple chemical databases for a specific
substance. The program will produce all available attributes of the substance
fourmi.py (34 lines changed)
@@ -1,4 +1,4 @@
#!/usr/bin/env python
# !/usr/bin/env python
"""
Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).
@@ -33,9 +33,16 @@ from FourmiCrawler.spider import FourmiSpider
from sourceloader import SourceLoader


def setup_crawler(searchable, settings, source_loader, attributes):
    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
    spider.add_parsers(source_loader.sources)
def setup_crawler(compound, settings, source_loader, attributes):
    """
    This function prepares and start the crawler which starts the actual search on the internet
    :param compound: The compound which should be searched
    :param settings: A scrapy settings object
    :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used.
    :param attributes: A list of regular expressions which the attribute names should match.
    """
    spider = FourmiSpider(compound=compound, selected_attributes=attributes)
    spider.add_sources(source_loader.sources)
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
@@ -44,8 +51,13 @@ def setup_crawler(searchable, settings, source_loader, attributes):


def scrapy_settings_manipulation(docopt_arguments):
    """
    This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi
    project these are command line arguments.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    """
    settings = get_project_settings()
    # [todo] - add at least a warning for files that already exist

    if docopt_arguments["--output"] != 'result.*format*':
        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
    elif docopt_arguments["--format"] == "jsonlines":
@@ -60,6 +72,10 @@ def scrapy_settings_manipulation(docopt_arguments):


def start_log(docopt_arguments):
    """
    This function starts the logging functionality of Scrapy using the settings given by the CLI.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    """
    if docopt_arguments["--log"] is not None:
        if docopt_arguments["--verbose"]:
            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
@@ -73,14 +89,20 @@ def start_log(docopt_arguments):


def search(docopt_arguments, source_loader):
    """
    The function that facilitates the search for a specific compound.
    :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments.
    :param source_loader: An initiated SourceLoader object pointed at the directory with the sources.
    """
    start_log(docopt_arguments)
    settings = scrapy_settings_manipulation(docopt_arguments)
    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
    reactor.run()


# The start for the Fourmi Command Line interface.
if __name__ == '__main__':
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.0')
    arguments = docopt.docopt(__doc__, version='Fourmi - V0.4.1')
    loader = SourceLoader()

    if arguments["--include"]:
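For context on scrapy_settings_manipulation above, a small sketch of how the --output and --format arguments might translate into Scrapy feed settings. Only the FEED_URI override and the jsonlines branch are visible in this hunk; the FEED_FORMAT key, the helper name and the default-placeholder handling are assumptions, and settings.overrides is the API of the Scrapy versions this project targets.

from scrapy.utils.project import get_project_settings


def configure_feed(output_uri, output_format):
    # Hypothetical helper mirroring scrapy_settings_manipulation().
    settings = get_project_settings()
    if output_uri != 'result.*format*':          # CLI default placeholder, as in the hunk above
        settings.overrides["FEED_URI"] = output_uri
    if output_format:
        settings.overrides["FEED_FORMAT"] = output_format   # assumed key, e.g. "jsonlines"
    return settings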
@@ -1,6 +1,7 @@
import inspect
import os
import re

from FourmiCrawler.sources.source import Source


@@ -8,6 +9,10 @@ class SourceLoader:
    sources = []

    def __init__(self, rel_dir="FourmiCrawler/sources"):
        """
        The initiation of a SourceLoader, selects and indexes a directory for usable sources.
        :param rel_dir: A relative path to a directory.
        """
        path = os.path.dirname(os.path.abspath(__file__))
        path += "/" + rel_dir
        known_parser = set()
@@ -21,18 +26,30 @@ class SourceLoader:
                known_parser.add(cls)

    def include(self, source_names):
        """
        This function excludes all sources that don't match the given regular expressions.
        :param source_names: A list of regular expression (strings)
        """
        new = set()
        for name in source_names:
            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = list(new)

    def exclude(self, source_names):
        """
        This function excludes all sources that match the given regular expressions.
        :param source_names: A list of regular expression (strings)
        """
        exclude = []
        for name in source_names:
            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
        self.sources = [src for src in self.sources if src not in exclude]

    def __str__(self):
        """
        This function returns a string with all sources currently available in the SourceLoader.
        :return: a string with all available sources.
        """
        string = ""
        for src in self.sources:
            string += "Source: " + src.__class__.__name__
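A short usage sketch of the loader methods documented above, as exercised by the tests that follow; the regular expressions are arbitrary examples:

from sourceloader import SourceLoader

loader = SourceLoader()              # indexes FourmiCrawler/sources by default
print(str(loader))                   # one "Source: <ClassName>" entry per discovered source
loader.exclude(["Wikipedia.*"])      # drop sources whose class name matches any pattern
loader.include(["NIST", "Chem.*"])   # keep only sources whose class name matches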
tests/__init__.py (new file, 1 line)
@@ -0,0 +1 @@

tests/test_pipeline.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import copy
import unittest

from scrapy.exceptions import DropItem

from FourmiCrawler import pipelines, spider, items


class TestPipelines(unittest.TestCase):
    def setUp(self):
        self.testItem = items.Result()

    def test_none_pipeline(self):
        # Testing the pipeline that replaces the None values in items.
        self.testItem["value"] = "abc"
        pipe = pipelines.RemoveNonePipeline()
        processed = pipe.process_item(self.testItem, spider.FourmiSpider())

        self.assertTrue(processed["value"] == "abc")

        for key in self.testItem:
            self.assertIsNotNone(processed[key])
            if key is not "value":
                self.assertIs(processed[key], "")

    def test_duplicate_pipeline(self):
        # Testing the pipeline that removes duplicates.
        self.testItem["attribute"] = "test"
        self.testItem["value"] = "test"
        self.testItem["conditions"] = "test"

        pipe = pipelines.DuplicatePipeline()
        self.assertEqual(pipe.process_item(self.testItem, spider.FourmiSpider()), self.testItem)
        self.assertRaises(DropItem, pipe.process_item, self.testItem, spider.FourmiSpider())

        other_item = copy.deepcopy(self.testItem)
        other_item["value"] = "test1"
        self.assertEqual(pipe.process_item(other_item, spider.FourmiSpider()), other_item)

    def test_attribute_selection(self):
        # Testing the pipeline that selects attributes.
        item1 = copy.deepcopy(self.testItem)
        item2 = copy.deepcopy(self.testItem)

        item1["attribute"] = "abd"
        item2["attribute"] = "abc"

        s = spider.FourmiSpider(selected_attributes=["a.d"])
        pipe = pipelines.AttributeSelectionPipeline()

        self.assertEqual(pipe.process_item(item1, s), item1)
        self.assertRaises(DropItem, pipe.process_item, item2, s)
tests/test_sourceloader.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import unittest

from sourceloader import SourceLoader


class TestSourceloader(unittest.TestCase):
    def setUp(self):
        self.loader = SourceLoader()

    def test_init(self):
        # Test if sourceloader points to the right directory, where the sources are present.
        self.assertIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))

    def test_include(self):
        # Tests for the include functionality.
        self.loader.include(["So.rc.*"])

        self.assertIn("Source: Source", str(self.loader))
        self.assertNotIn("Source: NIST", str(self.loader))
        self.assertNotIn("Source: ChemSpider", str(self.loader))
        self.assertNotIn("Source: WikipediaParser", str(self.loader))

    def test_exclude(self):
        # Tests for the exclude functionality.
        self.loader.exclude(["So.rc.*"])

        self.assertNotIn("Source: Source", str(self.loader))
        self.assertIn("Source: NIST", str(self.loader))
        self.assertIn("Source: ChemSpider", str(self.loader))
        self.assertIn("Source: WikipediaParser", str(self.loader))
tests/test_spider.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import unittest

from scrapy.http import Request

from FourmiCrawler import spider
from FourmiCrawler.sources.ChemSpider import ChemSpider
from FourmiCrawler.sources.source import Source


class TestFoumiSpider(unittest.TestCase):
    def setUp(self):
        self.compound = "test_compound"
        self.attributes = ["a.*", ".*a"]
        self.spi = spider.FourmiSpider(self.compound, self.attributes)

    def test_init(self):
        # Test the initiation of the Fourmi spider
        self.assertIn(self.compound, self.spi.synonyms)
        for attr in self.attributes:
            self.assertIn(attr, self.spi.selected_attributes)

    def test_add_source(self):
        # Testing the source adding function of the Fourmi spider
        src = Source()
        self.spi.add_source(src)
        self.assertIn(src, self.spi._sources)

    def test_add_sources(self):
        # Testing the function that adds multiple sources
        srcs = [Source(), Source(), Source()]
        self.spi.add_sources(srcs)

        for src in srcs:
            self.assertIn(src, self.spi._sources)

    def test_start_requests(self):
        # A test for the function that generates the start requests
        self.spi._sources = []

        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.start_requests(), [])

        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsNotNone(self.spi.start_requests())

    def test_synonym_requests(self):
        # A test for the synonym request function
        self.spi._sources = []

        src = Source()
        self.spi.add_source(src)
        self.assertEqual(self.spi.get_synonym_requests("new_compound"), [])
        self.assertIn("new_compound", self.spi.synonyms)

        src2 = ChemSpider()
        self.spi.add_source(src2)
        self.assertIsInstance(self.spi.get_synonym_requests("other_compound")[0], Request)
        self.assertIn("other_compound", self.spi.synonyms)
        self.assertEqual(self.spi.get_synonym_requests("other_compound"), [])