Merge branch 'release/basic-scraper-structure'

2014-03-30 22:16:13 +02:00 · 2014-03-30 22:16:13 +02:00 · e0556bbf16
commit e0556bbf16
parent f25af46aa0 e210ce8558
15 changed files with 102 additions and 22 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,9 @@
 #EDITOR AND IDE SPECIFIC SETTINGFILES
 .idea
 #Python Specific ignores
 *.pyc
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
--- a/Fourmi.py
+++ b/Fourmi.py
@ -0,0 +1,31 @@
 #!/usr/bin/env python
 """
 Fourmi - An internet webcrawler searching for information on chemical
 compounds. [todo] - Add some more useful text here.
 """
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
 from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 def setup_crawler(searchable):
     """Build a crawler for one compound and set it going.

     A FourmiSpider is created for the compound, a Crawler is built from
     the project settings, and the reactor is wired to stop as soon as the
     spider_closed signal is sent.

     :param searchable: Handed to FourmiSpider as its ``compound`` argument.
     """
     # [TODO] - Initiate all parsers for the different websites and get
     # allowed URLs.
     compound_spider = FourmiSpider(compound=searchable)
     fourmi_crawler = Crawler(get_project_settings())
     # Without this hook nothing would stop the reactor after the crawl.
     fourmi_crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     fourmi_crawler.configure()
     fourmi_crawler.crawl(compound_spider)
     fourmi_crawler.start()
 def start(compound="Methane"):
     """Set up the crawler, start logging and run the reactor.

     :param compound: Search term passed on to setup_crawler(). Defaults to
         "Methane", the value that used to be hard-coded here, so existing
         ``start()`` calls behave as before.
     """
     setup_crawler(compound)
     log.start()
     reactor.run()
 start()
--- a/FourmiCrawler/init.py
+++ b/FourmiCrawler/init.py
--- a/FourmiCrawler/items.py
+++ b/FourmiCrawler/items.py
@ -5,7 +5,10 @@
 from scrapy.item import Item, Field
-class FourmiItem(Item):
+
-    # define the fields for your item here like:
+class Result(Item):
-    # name = Field()
+    attribute = Field()
-    pass
+    value = Field()
    source = Field()
    reliability = Field()
    conditions = Field()
--- a/FourmiCrawler/parsers/init.py
+++ b/FourmiCrawler/parsers/init.py
--- a/FourmiCrawler/parsers/parser.py
+++ b/FourmiCrawler/parsers/parser.py
@ -0,0 +1,9 @@
 from scrapy import log
 class Parser:
     """Empty placeholder parser that only warns when it is used."""

     # Placeholder URL pattern; presumably the pages a parser is
     # responsible for -- TODO confirm once real parsers exist.
     website = "http://localhost/*"

     def parse(self, reponse):
         """Log a warning that the empty parser's parse function was called.

         :param reponse: Unused here. The parameter name (including its
             spelling) is kept so keyword callers keep working.
         :return: None
         """
         # scrapy.log names its levels in upper case (log.WARNING);
         # log.Warning is not defined and raised an AttributeError.
         log.msg("The parse function of the empty parser was used.", level=log.WARNING)
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@ -0,0 +1,25 @@
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 from scrapy.exceptions import DropItem
 class FourmiPipeline(object):
     """Item pipeline that drops exact duplicates."""

     def __init__(self):
         # (attribute, value) pairs of every item passed on so far.
         self.known_values = set()

     def process_item(self, item, spider):
         """
         Pass an item on unless its attribute/value pair was seen before.

         :param item: The incoming item
         :param spider: The spider which scraped the item (not used here)
         :return: The item itself when its attribute/value pair is new
         :raise DropItem: When the same pair has already been processed
         """
         fingerprint = (item['attribute'], item['value'])
         if fingerprint not in self.known_values:
             self.known_values.add(fingerprint)
             return item
         raise DropItem("Duplicate item found: %s" % item)
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@ -6,10 +6,15 @@
 #     http://doc.scrapy.org/en/latest/topics/settings.html
 #
-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'
-SPIDER_MODULES = ['Scrapy.spiders']
+SPIDER_MODULES = ['FourmiCrawler']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
+# Crawl responsibly by identifying yourself (and your website) on the
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+# user-agent
 # USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@ -0,0 +1,16 @@
 from scrapy.spider import Spider
 class FourmiSpider(Spider):
     """Spider that keeps a collection of parsers added via add_parser()."""

     name = "FourmiSpider"

     def __init__(self, compound=None, *args, **kwargs):
         """Initialise the spider.

         :param compound: The compound this spider was created for.
         :param args: Positional arguments forwarded to Spider.__init__.
         :param kwargs: Keyword arguments forwarded to Spider.__init__.
         """
         super(FourmiSpider, self).__init__(*args, **kwargs)
         # add_parser() needs this collection; it was never created before,
         # so the first add_parser() call raised an AttributeError.
         self.parsers = set()
         # Keep the compound instead of silently discarding the argument.
         self.compound = compound

     def parse(self, reponse):
         """Handle a response; currently does nothing.

         :param reponse: Unused for now. The parameter name (including its
             spelling) is kept so keyword callers keep working.
         """
         # [TODO] - This function should delegate its functionality to other
         # parsers.
         pass

     def add_parser(self, parser):
         """Add a parser to this spider's parser collection.

         :param parser: The parser to add; it must be hashable because the
             collection is a set.
         """
         self.parsers.add(parser)
--- a/Scrapy/init.pyc
+++ b/Scrapy/init.pyc
--- a/Scrapy/pipelines.py
+++ b/Scrapy/pipelines.py
@ -1,8 +0,0 @@
 # Define your item pipelines here
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 class FourmiPipeline(object):
     """Pass-through item pipeline."""

     def process_item(self, item, spider):
         """Hand the item back exactly as it was received.

         :param item: The incoming item
         :param spider: Not used here
         :return: The same item object
         """
         return item
--- a/Scrapy/settings.pyc
+++ b/Scrapy/settings.pyc
--- a/Scrapy/spiders/init.py
+++ b/Scrapy/spiders/init.py
@ -1,4 +0,0 @@
 # This package will contain the spiders of your Scrapy project
 #
 # Please refer to the documentation for information on how to create and manage
 # your spiders.
--- a/Scrapy/spiders/init.pyc
+++ b/Scrapy/spiders/init.pyc
--- a/scrapy.cfg
+++ b/scrapy.cfg
@ -4,7 +4,7 @@
 # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 [settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings
 [deploy]
 #url = http://localhost:6800/