
Merge branch 'feature/basic-structure' into develop

Commit 6bbee865c4 by Jip J. Dekker, 2014-03-28 14:46:43 +01:00
9 changed files with 55 additions and 46 deletions


@@ -1,22 +1,31 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical compounds.
-[todo] - Add some more useful text here.
+Fourmi - An internet webcrawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
 """
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-# [todo] - Add something to add all spiders, with the right references
-spider = ChemspiderSpider(compound = "Aspirin")
-settings = get_project_settings()
-crawler = Crawler(settings)
-crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-crawler.configure()
-crawler.crawl(spider)
-crawler.start()
-log.start()
-reactor.run()
+
+def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def start():
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
+
+start()
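The [TODO] in setup_crawler points at the missing piece: per-website parsers still have to be created and attached to the spider. A minimal sketch of what that wiring could look like, using only names from this commit plus a hypothetical PARSERS registry:

# Sketch only; PARSERS is a hypothetical registry, not part of this commit.
PARSERS = [Parser]  # Parser is the placeholder base class added further down

def setup_crawler(searchable):
    spider = FourmiSpider(compound=searchable)
    for parser_class in PARSERS:
        spider.add_parser(parser_class())
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()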


@@ -5,9 +5,10 @@
 from scrapy.item import Item, Field
 
 
 class Result(Item):
     attribute = Field()
     value = Field()
     source = Field()
     reliability = Field()
-    conditions = Field()
+    conditions = Field()
+
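Result is an ordinary Scrapy Item, so its fields are filled dict-style. A short usage sketch with made-up values (the import path is assumed from the standard Scrapy project layout):

from FourmiCrawler.items import Result  # assumed module path

result = Result()
result['attribute'] = 'boiling point'  # example values, not real data
result['value'] = '100 degrees Celsius'
result['source'] = 'some source website'
result['reliability'] = 'unverified'
result['conditions'] = 'at 1 atm'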


@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    website = "http://localhost/*"
+
+    def parse(self, response):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass
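Concrete parsers would subclass this placeholder and override parse for their own site. A hypothetical sketch (the class name and URL pattern are assumptions, not part of this commit):

class ChemspiderParser(Parser):
    # Hypothetical subclass for illustration only.
    website = "http://www.chemspider.com/*"

    def parse(self, response):
        # Extract Result items from the ChemSpider response here.
        pass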


@@ -8,11 +8,13 @@
 BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['FourmiCrawler.spiders']
-NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
 
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
 
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
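ITEM_PIPELINES maps each pipeline class to a priority between 0 and 1000; lower numbers run first. The referenced FourmiCrawler.pipelines.FourmiPipeline is not shown in this diff, so here is only a minimal sketch of the interface such a pipeline has to implement, assuming it simply passes items through:

# FourmiCrawler/pipelines.py -- interface sketch only; the real
# FourmiPipeline implementation is not part of this diff.
class FourmiPipeline(object):

    def process_item(self, item, spider):
        # A real pipeline might clean, validate, or de-duplicate the item here.
        return item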

FourmiCrawler/spider.py (new file)

@@ -0,0 +1,17 @@
+from scrapy.spider import Spider
+
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        self.parsers = set()  # was missing; add_parser below needs it
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other
+        # parsers.
+        pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)
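Together with the Parser placeholder from earlier in this diff, the spider composes like this; a usage sketch, not code from this commit:

spider = FourmiSpider(compound="Methane")
spider.add_parser(Parser())  # attach the placeholder parser defined above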


@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give a logical start URL.
-
-    def parse(self, response):
-        pass


@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give a logical start URL.
-
-    def parse(self, response):
-        pass


@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.