
Merge branch 'feature/basic-structure' into develop

commit aa65bbd459
Author: Jip J. Dekker
Date: 2014-03-18 18:10:03 +01:00

12 changed files with 79 additions and 35 deletions

Fourmi.py (new executable file)

@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""
Fourmi - An internet crawler that searches for information on chemical compounds.
[todo] - Add some more useful text here.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!

# [todo] - Add something to start all spiders, with the right references
spider = ChemspiderSpider(compound="Aspirin")
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
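
The [review] and [todo] notes above ask for a way to start every spider without importing each one by hand. A minimal sketch of one approach, using the same pre-1.0 Scrapy API as this commit; the hard-coded class list and the counter-based shutdown are assumptions, not part of the commit:

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from FourmiCrawler.spiders.Chemspider import ChemspiderSpider
from FourmiCrawler.spiders.Wikipedia import WikipediaSpider

SPIDER_CLASSES = [ChemspiderSpider, WikipediaSpider]  # assumption: still listed by hand
open_spiders = [len(SPIDER_CLASSES)]

def _spider_closed():
    # Stop the reactor only once the last spider has closed,
    # instead of on the first spider_closed signal.
    open_spiders[0] -= 1
    if open_spiders[0] == 0:
        reactor.stop()

settings = get_project_settings()
for spider_cls in SPIDER_CLASSES:
    crawler = Crawler(settings)
    crawler.signals.connect(_spider_closed, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider_cls(compound="Aspirin"))
    crawler.start()

log.start()
reactor.run()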

FourmiCrawler/pipelines.py (new file)

@@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process an incoming item, dropping exact duplicates.
        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: the item, if its (attribute, value) pair is not yet known
        :raise DropItem: if the item is an exact duplicate of an earlier one
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
        return item
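
A quick sketch of the intended behaviour, using plain dicts as stand-in items (the attribute and value are hypothetical; anything indexable by 'attribute' and 'value' works here):

from scrapy.exceptions import DropItem
from FourmiCrawler.pipelines import FourmiPipeline

pipeline = FourmiPipeline()
first = {'attribute': 'boiling point', 'value': '373 K'}  # hypothetical item
print(pipeline.process_item(first, spider=None))  # unique: passes through
try:
    pipeline.process_item(dict(first), spider=None)  # exact double of the first
except DropItem as exc:
    print(exc)  # "Duplicate item found: ..."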

FourmiCrawler/settings.py

@@ -6,10 +6,13 @@
# http://doc.scrapy.org/en/latest/topics/settings.html
#
-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'

-SPIDER_MODULES = ['Scrapy.spiders']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+SPIDER_MODULES = ['FourmiCrawler.spiders']
+NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+ITEM_PIPELINES = {
+    'FourmiCrawler.pipelines.FourmiPipeline': 100
+}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
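
For reference, the integer assigned to each entry in ITEM_PIPELINES is an ordering value in the 0-1000 range; items pass through the enabled pipelines in ascending order. A sketch of how a second pipeline would slot in, where AttributeNormalizer is a hypothetical name, not something in this commit:

ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.AttributeNormalizer': 50,  # hypothetical: would run first
    'FourmiCrawler.pipelines.FourmiPipeline': 100,      # the duplicate filter runs after it
}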

FourmiCrawler/spiders/Chemspider.py (new file)

@@ -0,0 +1,12 @@
from scrapy.spider import Spider


class ChemspiderSpider(Spider):
    name = "Chemspider"
    allowed_domains = ["chemspider.com"]

    def __init__(self, compound=None, *args, **kwargs):
        super(ChemspiderSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [todo] - Give a logical start URL.

    def parse(self, response):
        pass
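
Because the compound is taken as a constructor keyword, the spider can also be launched from the Scrapy command line with: scrapy crawl Chemspider -a compound=Aspirin; each -a flag is forwarded to __init__ as a keyword argument. The direct equivalent in code:

from FourmiCrawler.spiders.Chemspider import ChemspiderSpider

spider = ChemspiderSpider(compound="Aspirin")
print(spider.start_urls)  # ["http://chemspiderapiurl/something/Aspirin"] (placeholder URL from the commit)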

FourmiCrawler/spiders/Wikipedia.py (new file)

@@ -0,0 +1,12 @@
from scrapy.spider import Spider


class WikipediaSpider(Spider):
    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]

    def __init__(self, compound=None, *args, **kwargs):
        super(WikipediaSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [todo] - Give a logical start URL.

    def parse(self, response):
        pass

Scrapy/pipelines.py (deleted)

@@ -1,8 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class FourmiPipeline(object):
    def process_item(self, item, spider):
        return item

Scrapy/spiders/Chemspider.py (deleted)

@@ -1,11 +0,0 @@
from scrapy.spider import Spider


class ChemspiderSpider(Spider):
    name = "Chemspider"
    allowed_domains = ["chemspider.com"]
    start_urls = (
        'http://www.chemspider.com/',
    )

    def parse(self, response):
        pass

Scrapy/spiders/Wikipedia.py (deleted)

@@ -1,11 +0,0 @@
from scrapy.spider import Spider


class WikipediaSpider(Spider):
    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]
    start_urls = (
        'http://www.wikipedia.org/',
    )

    def parse(self, response):
        pass

scrapy.cfg

@@ -4,7 +4,7 @@
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings
[deploy]
#url = http://localhost:6800/