Merge branch 'feature/basic-structure' into develop
commit aa65bbd459
Fourmi.py (new executable file, 22 lines added)
@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""
Fourmi - A web crawler searching for information on chemical compounds.
[todo] - Add some more useful text here.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
from scrapy.utils.project import get_project_settings

# [todo] - Add something to add all spiders, with the right references
spider = ChemspiderSpider(compound="Aspirin")
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
FourmiCrawler/pipelines.py (new file, 25 lines added)
@@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process incoming items so that exact duplicates are dropped.

        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: The item, if it is not yet known
        :raise DropItem: If the (attribute, value) pair was already seen
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
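A quick sketch of the deduplication behaviour with invented items (in a real crawl Scrapy calls process_item itself; the attribute/value data below is made up). Note that the key is only the (attribute, value) pair, so the same pair scraped by a different spider would also be dropped:

pipeline = FourmiPipeline()
item = {'attribute': 'boiling point', 'value': '100 degrees C'}  # invented example data
pipeline.process_item(item, spider=None)        # first occurrence: the item is returned
pipeline.process_item(dict(item), spider=None)  # second occurrence: raises DropItem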
FourmiCrawler/settings.py (modified)
@@ -6,10 +6,13 @@
 # http://doc.scrapy.org/en/latest/topics/settings.html
 #

-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'

-SPIDER_MODULES = ['Scrapy.spiders']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+SPIDER_MODULES = ['FourmiCrawler.spiders']
+NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+ITEM_PIPELINES = {
+    'FourmiCrawler.pipelines.FourmiPipeline': 100
+}

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
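The value 100 is the pipeline's order key: Scrapy runs the entries in ITEM_PIPELINES in ascending order of these numbers (conventionally in the 0-1000 range). As a sketch, a hypothetical second pipeline (the ValidationPipeline name below is assumed, not in this commit) would run first by taking a lower number:

ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.ValidationPipeline': 50,  # hypothetical, runs first (lower number)
    'FourmiCrawler.pipelines.FourmiPipeline': 100,
}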
FourmiCrawler/spiders/Chemspider.py (new file, 12 lines added)
@@ -0,0 +1,12 @@
from scrapy.spider import Spider

class ChemspiderSpider(Spider):
    name = "Chemspider"
    allowed_domains = ["chemspider.com"]

    def __init__(self, compound=None, *args, **kwargs):
        super(ChemspiderSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give a logical start URL.

    def parse(self, response):
        pass
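Because compound is accepted as a constructor keyword, the spider can also be launched from the command line with Scrapy's -a option, e.g. scrapy crawl Chemspider -a compound=Aspirin; Scrapy passes -a arguments to __init__ as keyword arguments. The start URL itself is still the placeholder flagged by the [TODO] above.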
FourmiCrawler/spiders/Wikipedia.py (new file, 12 lines added)
@@ -0,0 +1,12 @@
from scrapy.spider import Spider

class WikipediaSpider(Spider):
    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]

    def __init__(self, compound=None, *args, **kwargs):
        super(WikipediaSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give a logical start URL.

    def parse(self, response):
        pass
Scrapy/pipelines.py (deleted)
@@ -1,8 +0,0 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
-class FourmiPipeline(object):
-    def process_item(self, item, spider):
-        return item
Scrapy/spiders/Chemspider.py (deleted)
@@ -1,11 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-    start_urls = (
-        'http://www.chemspider.com/',
-    )
-
-    def parse(self, response):
-        pass
Scrapy/spiders/Wikipedia.py (deleted)
@@ -1,11 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-    start_urls = (
-        'http://www.wikipedia.org/',
-    )
-
-    def parse(self, response):
-        pass
scrapy.cfg (modified)
@@ -4,7 +4,7 @@
 # http://doc.scrapy.org/en/latest/topics/scrapyd.html

 [settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings

 [deploy]
 #url = http://localhost:6800/
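The [settings] default entry is how the scrapy command-line tool locates the project's settings module, so it has to track the Scrapy to FourmiCrawler module rename; without this change, scrapy crawl would still try to import the old Scrapy.settings.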