
Merge branch 'release/basic-scraper-structure'

This commit is contained in:
Jip J. Dekker 2014-03-30 22:16:13 +02:00
commit e0556bbf16
15 changed files with 102 additions and 22 deletions

.gitignore vendored

@@ -1,6 +1,9 @@
# EDITOR AND IDE SPECIFIC SETTING FILES
.idea
#Python Specific ignores
*.pyc
#THINGS WE WOULD NEVER EVER WANT!
#ignore thumbnails created by windows
Thumbs.db

Fourmi.py Executable file

@@ -0,0 +1,31 @@
#!/usr/bin/env python
"""
Fourmi - An internet web crawler searching for information on chemical
compounds. [todo] - Add some more useful text here.
"""
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spider import FourmiSpider
from scrapy.utils.project import get_project_settings


def setup_crawler(searchable):
    # [TODO] - Initiate all parsers for the different websites and get
    # allowed URLs.
    spider = FourmiSpider(compound=searchable)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def start():
    setup_crawler("Methane")
    log.start()
    reactor.run()


if __name__ == '__main__':
    start()
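In this commit start() always crawls the hard-coded compound "Methane". A minimal sketch of how the compound could instead be taken from the command line; the sys.argv handling below is an assumption for illustration, not something this commit contains:

import sys

def start():
    # Hypothetical variant: read the compound from the first CLI argument,
    # falling back to the default used in this commit.
    compound = sys.argv[1] if len(sys.argv) > 1 else "Methane"
    setup_crawler(compound)
    log.start()
    reactor.run()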


@@ -5,7 +5,10 @@
from scrapy.item import Item, Field


class FourmiItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass


class Result(Item):
    attribute = Field()
    value = Field()
    source = Field()
    reliability = Field()
    conditions = Field()
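The Result item is the container the site parsers are meant to fill. A small sketch of how one could be populated; the field values and the FourmiCrawler.items import path are assumptions for illustration, not part of this diff:

from FourmiCrawler.items import Result  # assumed module path

# Hypothetical values for a single scraped property:
item = Result()
item['attribute'] = 'boiling point'
item['value'] = '-161.5 degrees Celsius'
item['source'] = 'http://example.com/methane'
item['reliability'] = 'unknown'
item['conditions'] = '1 atm'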

@@ -0,0 +1,9 @@
from scrapy import log


class Parser:
    website = "http://localhost/*"

    def parse(self, response):
        log.msg("The parse function of the empty parser was used.",
                level=log.WARNING)
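Site-specific scrapers would presumably subclass this empty Parser, overriding website and parse. A sketch of what such a subclass might look like; the source name, URL pattern, and the idea of returning Result items are assumptions, and the import of Parser itself depends on a module path that is not visible in this view:

from scrapy import log


class ExampleSourceParser(Parser):
    # Hypothetical data source; only the empty stub exists in this commit.
    website = "http://example-chemical-db.org/*"

    def parse(self, response):
        log.msg("Parsing %s" % response.url, level=log.DEBUG)
        # A real implementation would extract Result items from the response.
        return []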


@@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process the items so that exact doubles are dropped.

        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: The item, if its attribute/value combination is not yet known
        :raise DropItem: When the item is an exact double of an earlier one
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
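The deduplication key is the (attribute, value) tuple, so two items that differ only in source or conditions still count as doubles. A rough illustration outside of a real crawl; the Result import path and the example values are assumptions:

from FourmiCrawler.items import Result  # assumed module path

pipeline = FourmiPipeline()
first = Result(attribute='melting point', value='-182.5 degrees Celsius',
               source='A', reliability='', conditions='')
double = Result(attribute='melting point', value='-182.5 degrees Celsius',
                source='B', reliability='', conditions='')
pipeline.process_item(first, spider=None)   # kept, key is now known
pipeline.process_item(double, spider=None)  # raises DropItem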


@@ -6,10 +6,15 @@
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'Fourmi'
BOT_NAME = 'FourmiCrawler'
SPIDER_MODULES = ['Scrapy.spiders']
NEWSPIDER_MODULE = 'Scrapy.spiders'
SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler'
ITEM_PIPELINES = {
'FourmiCrawler.pipelines.FourmiPipeline': 100
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
# Crawl responsibly by identifying yourself (and your website) on the
# user-agent
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
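The number behind the pipeline entry is its order: Scrapy runs pipelines with lower numbers first, and the documented convention is to keep the values in the 0-1000 range. A sketch of how a second, hypothetical stage would be chained behind the duplicate filter; the ExportPipeline name is an assumption, not part of this commit:

ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.FourmiPipeline': 100,
    # Hypothetical later stage, not part of this commit:
    # 'FourmiCrawler.pipelines.ExportPipeline': 200,
}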

FourmiCrawler/spider.py Normal file

@@ -0,0 +1,16 @@
from scrapy.spider import Spider


class FourmiSpider(Spider):
    name = "FourmiSpider"

    def __init__(self, compound=None, *args, **kwargs):
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.parsers = set()

    def parse(self, response):
        # [TODO] - This function should delegate its functionality to other
        # parsers.
        pass

    def add_parser(self, parser):
        self.parsers.add(parser)
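The parse stub is intended to hand each response to the registered parsers. A possible shape for that delegation; the naive URL-prefix match against parser.website is an assumption, not something this commit implements:

    def parse(self, response):
        # Hypothetical delegation: find a parser whose website pattern
        # matches the response URL and let it handle the page.
        for parser in self.parsers:
            if response.url.startswith(parser.website.rstrip("*")):
                return parser.parse(response)
        return None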

Binary file not shown.


@@ -1,8 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class FourmiPipeline(object):
    def process_item(self, item, spider):
        return item

Binary file not shown.


@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Binary file not shown.


@@ -4,7 +4,7 @@
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
default = Scrapy.settings
default = FourmiCrawler.settings
[deploy]
#url = http://localhost:6800/