diff --git a/Fourmi.py b/Fourmi.py index 9bdec24..96c808e 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Fourmi - An internet webcrawler searching for information on chemical compounds. -[todo] - Add some more useful text here. +Fourmi - An internet webcrawler searching for information on chemical +compounds. [todo] - Add some more useful text here. """ from twisted.internet import reactor @@ -10,17 +10,19 @@ from scrapy import log, signals from FourmiCrawler.spiders.Fourmispider import FourmiSpider from scrapy.utils.project import get_project_settings + def setup_crawler(searchable): - spider = FourmiSpider(compound=searchable) # [todo] - Do something smart to get the different spiders to work here. - settings = get_project_settings() - crawler = Crawler(settings) - crawler.configure() - crawler.crawl(spider) - crawler.start() + spider = FourmiSpider(compound=searchable) + settings = get_project_settings() + crawler = Crawler(settings) + crawler.configure() + crawler.crawl(spider) + crawler.start() + def start(): - setup_crawler("Methane") - log.start() - reactor.run() + setup_crawler("Methane") + log.start() + reactor.run() start() diff --git a/FourmiCrawler/items.py b/FourmiCrawler/items.py index 5fedc36..c7fd41c 100644 --- a/FourmiCrawler/items.py +++ b/FourmiCrawler/items.py @@ -5,9 +5,10 @@ from scrapy.item import Item, Field + class Result(Item): attribute = Field() value = Field() source = Field() reliability = Field() - conditions = Field() \ No newline at end of file + conditions = Field() diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index 28272d0..b025167 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -14,5 +14,7 @@ ITEM_PIPELINES = { 'FourmiCrawler.pipelines.FourmiPipeline': 100 } -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' +# Crawl responsibly by identifying yourself (and 
your website) on the +# user-agent + +# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index f7b64bd..2805c8e 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -1,12 +1,15 @@ from scrapy.spider import Spider -class FourmiSpider(Spider): - name="FourmiSpider" - def __init__(self, compound=None, *args, **kwargs): - super(FourmiSpider, self).__init__(*args, **kwargs) - # [TODO] - Initiate all parsers for the different websites and get allowed URLs. - - def parse(self, reponse): - # [TODO] - This function should delegate it's functionality to other parsers. - pass +class FourmiSpider(Spider): + name = "FourmiSpider" + + def __init__(self, compound=None, *args, **kwargs): + super(FourmiSpider, self).__init__(*args, **kwargs) + # [TODO] - Initiate all parsers for the different websites and get + # allowed URLs. + + def parse(self, reponse): + # [TODO] - This function should delegate its functionality to other + # parsers. + pass