diff --git a/Fourmi.py b/Fourmi.py
index 16029f9..f1bf1ba 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -1,22 +1,31 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical compounds.
-[todo] - Add some more useful text here.
+Fourmi - An internet webcrawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
 """
 
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-# [todo] - Add something to add all spiders, with the right references
-spider = ChemspiderSpider(compound = "Aspirin")
-settings = get_project_settings()
-crawler = Crawler(settings)
-crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-crawler.configure()
-crawler.crawl(spider)
-crawler.start()
-log.start()
-reactor.run()
\ No newline at end of file
+
+def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def start():
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
+
+start()
diff --git a/FourmiCrawler/items.py b/FourmiCrawler/items.py
index 5fedc36..c7fd41c 100644
--- a/FourmiCrawler/items.py
+++ b/FourmiCrawler/items.py
@@ -5,9 +5,10 @@
 
 from scrapy.item import Item, Field
 
+
 class Result(Item):
     attribute = Field()
     value = Field()
     source = Field()
     reliability = Field()
-    conditions = Field()
\ No newline at end of file
+    conditions = Field()
diff --git a/FourmiCrawler/parsers/__init__.py b/FourmiCrawler/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py
new file mode 100644
index 0000000..3362d59
--- /dev/null
+++ b/FourmiCrawler/parsers/parser.py
@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    website = "http://localhost/*"
+
+    def parse(self, response):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 0f5eae8..b025167 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -8,11 +8,13 @@
 
 BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['FourmiCrawler.spiders']
-NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
 
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
new file mode 100644
index 0000000..4c25df9
--- /dev/null
+++ b/FourmiCrawler/spider.py
@@ -0,0 +1,17 @@
+from scrapy.spider import Spider
+
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        self.parsers = set()
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other
+        # parsers.
+        pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)
diff --git a/FourmiCrawler/spiders/Chemspider.py b/FourmiCrawler/spiders/Chemspider.py
deleted file mode 100644
index b85b44d..0000000
--- a/FourmiCrawler/spiders/Chemspider.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound] #[TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass
diff --git a/FourmiCrawler/spiders/Wikipedia.py b/FourmiCrawler/spiders/Wikipedia.py
deleted file mode 100644
index 62ed026..0000000
--- a/FourmiCrawler/spiders/Wikipedia.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound] #[TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass
diff --git a/FourmiCrawler/spiders/__init__.py b/FourmiCrawler/spiders/__init__.py
deleted file mode 100644
index ebd689a..0000000
--- a/FourmiCrawler/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
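
Note on the [TODO] in FourmiCrawler/spider.py: FourmiSpider.parse is left empty but is meant to delegate to the registered parsers. Below is a minimal sketch of what that delegation could look like, assuming each parser's `website` attribute is a glob pattern (as the `"http://localhost/*"` default in parser.py suggests) that can be matched against the response URL with fnmatch; the matching rule itself is an assumption, not something this changeset defines.

    from fnmatch import fnmatch

    from scrapy.spider import Spider


    class FourmiSpider(Spider):
        name = "FourmiSpider"

        def __init__(self, compound=None, *args, **kwargs):
            super(FourmiSpider, self).__init__(*args, **kwargs)
            self.parsers = set()

        def parse(self, response):
            # Hand the response to the first registered parser whose
            # website glob matches the URL of this response.
            for parser in self.parsers:
                if fnmatch(response.url, parser.website):
                    return parser.parse(response)
            return None

        def add_parser(self, parser):
            self.parsers.add(parser)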
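
Likewise, the [TODO] in Fourmi.py's setup_crawler asks for the parsers to be initiated and the allowed URLs collected. A hedged sketch of that wiring follows; ChemspiderParser and WikipediaParser are hypothetical subclasses of Parser (only the empty base class exists in this changeset).

    def setup_crawler(searchable):
        spider = FourmiSpider(compound=searchable)
        # Hypothetical concrete parsers, one per supported website;
        # these classes are not part of this changeset.
        for parser in [ChemspiderParser(), WikipediaParser()]:
            spider.add_parser(parser)
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()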