diff --git a/Fourmi.py b/Fourmi.py
index 640f9f7..a0a9ead 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -7,13 +7,11 @@ Fourmi - An internet webcrawler searching for information on chemical compounds.
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings

-defined_spiders = [ChemspiderSpider(compound = "Methane")]
-
-def setup_crawler(Spider, compound):
-    spider = FollowAllSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
+def setup_crawler(compound):
+    spider = FourmiSpider(compound=compound) # [todo] - Do something smart to get the different spiders to work here.
     settings = get_project_settings()
     crawler = Crawler(settings)
     crawler.configure()
@@ -21,8 +19,7 @@ def setup_crawler(Spider, compound):
     crawler.start()

 def start():
-    for spider in defined_spiders:
-        setup_crawler(spider, compound)
+    setup_crawler(compound)
     log.start()
     reactor.run()
diff --git a/FourmiCrawler/spiders/Chemspider.py b/FourmiCrawler/spiders/Chemspider.py
deleted file mode 100644
index b85b44d..0000000
--- a/FourmiCrawler/spiders/Chemspider.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound] #[TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass
diff --git a/FourmiCrawler/spiders/Fourmispider.py b/FourmiCrawler/spiders/Fourmispider.py
new file mode 100644
index 0000000..f7b64bd
--- /dev/null
+++ b/FourmiCrawler/spiders/Fourmispider.py
@@ -0,0 +1,12 @@
+from scrapy.spider import Spider
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        # [TODO] - Initialize all parsers for the different websites and collect the allowed URLs.
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other parsers.
+        pass
diff --git a/FourmiCrawler/spiders/Wikipedia.py b/FourmiCrawler/spiders/Wikipedia.py
deleted file mode 100644
index 62ed026..0000000
--- a/FourmiCrawler/spiders/Wikipedia.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound] #[TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass
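
Review note: the [TODO] markers in FourmiSpider leave the parser delegation unimplemented. Below is a minimal sketch of how parse() could hand responses off to per-site parsers. The parsers list, the add_parser() helper, and the per-parser website attribute and parse(response) method are all assumptions for illustration; none of them exist in this patch.

    from scrapy.spider import Spider

    class FourmiSpider(Spider):
        name = "FourmiSpider"

        def __init__(self, compound=None, *args, **kwargs):
            super(FourmiSpider, self).__init__(*args, **kwargs)
            self.compound = compound
            self.parsers = []  # hypothetical: one parser object per website

        def add_parser(self, parser):
            # Hypothetical helper: register a per-site parser and allow its
            # domain, so Scrapy's offsite middleware permits its requests.
            self.parsers.append(parser)
            self.allowed_domains = [p.website for p in self.parsers]

        def parse(self, response):
            # Delegate to the first parser whose site appears in the URL.
            for parser in self.parsers:
                if parser.website in response.url:
                    return parser.parse(response)
            return None

With something like this in place, setup_crawler() could register one parser per supported site on a single FourmiSpider instead of instantiating a spider class per website, which appears to be the direction this patch is heading.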