From 55843d320c54b7c7a39c398170d657d07ee80c71 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Mon, 17 Mar 2014 16:25:48 +0100 Subject: [PATCH 1/7] Added a formal pipeline to make sure that we don't supply duplicate values. --- Scrapy/pipelines.py | 19 ++++++++++++++++++- Scrapy/settings.py | 3 +++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Scrapy/pipelines.py b/Scrapy/pipelines.py index 3345787..3194d7e 100644 --- a/Scrapy/pipelines.py +++ b/Scrapy/pipelines.py @@ -2,7 +2,24 @@ # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html +from scrapy.exceptions import DropItem + class FourmiPipeline(object): + + def __init__(self): + self.known_values = set() + def process_item(self, item, spider): - return item + """ + Processing the items so exact duplicates are dropped + :param item: The incoming item + :param spider: The spider which scraped the item + :return: :raise DropItem: Returns the item if unique or drops it if it's already known + """ + value = item['attribute'], item['value'] + if value in self.known_values: + raise DropItem("Duplicate item found: %s" % item) + else: + self.known_values.add(value) + return item diff --git a/Scrapy/settings.py b/Scrapy/settings.py index e43aa2b..fd379a9 100644 --- a/Scrapy/settings.py +++ b/Scrapy/settings.py @@ -10,6 +10,9 @@ BOT_NAME = 'Fourmi' SPIDER_MODULES = ['Scrapy.spiders'] NEWSPIDER_MODULE = 'Scrapy.spiders' +ITEM_PIPELINES = { + 'Scrapy.pipelines.FourmiPipeline': 100 +} # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'Fourmi (+http://www.yourdomain.com)' From 2cb21c6b810624cf5569f95828b326fc1ccc1996 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Mon, 17 Mar 2014 16:38:13 +0100 Subject: [PATCH 2/7] Moved the Scrapy code to another namespace, should prevent some importing faults. 
--- {Scrapy => Fourmi}/__init__.py | 0 {Scrapy => Fourmi}/items.py | 0 {Scrapy => Fourmi}/pipelines.py | 0 {Scrapy => Fourmi}/settings.py | 6 +++--- {Scrapy => Fourmi}/spiders/Chemspider.py | 0 {Scrapy => Fourmi}/spiders/Wikipedia.py | 0 {Scrapy => Fourmi}/spiders/__init__.py | 0 7 files changed, 3 insertions(+), 3 deletions(-) rename {Scrapy => Fourmi}/__init__.py (100%) rename {Scrapy => Fourmi}/items.py (100%) rename {Scrapy => Fourmi}/pipelines.py (100%) rename {Scrapy => Fourmi}/settings.py (77%) rename {Scrapy => Fourmi}/spiders/Chemspider.py (100%) rename {Scrapy => Fourmi}/spiders/Wikipedia.py (100%) rename {Scrapy => Fourmi}/spiders/__init__.py (100%) diff --git a/Scrapy/__init__.py b/Fourmi/__init__.py similarity index 100% rename from Scrapy/__init__.py rename to Fourmi/__init__.py diff --git a/Scrapy/items.py b/Fourmi/items.py similarity index 100% rename from Scrapy/items.py rename to Fourmi/items.py diff --git a/Scrapy/pipelines.py b/Fourmi/pipelines.py similarity index 100% rename from Scrapy/pipelines.py rename to Fourmi/pipelines.py diff --git a/Scrapy/settings.py b/Fourmi/settings.py similarity index 77% rename from Scrapy/settings.py rename to Fourmi/settings.py index fd379a9..a24e6f6 100644 --- a/Scrapy/settings.py +++ b/Fourmi/settings.py @@ -8,10 +8,10 @@ BOT_NAME = 'Fourmi' -SPIDER_MODULES = ['Scrapy.spiders'] -NEWSPIDER_MODULE = 'Scrapy.spiders' +SPIDER_MODULES = ['Fourmi.spiders'] +NEWSPIDER_MODULE = 'Fourmi.spiders' ITEM_PIPELINES = { - 'Scrapy.pipelines.FourmiPipeline': 100 + 'Fourmi.pipelines.FourmiPipeline': 100 } # Crawl responsibly by identifying yourself (and your website) on the user-agent diff --git a/Scrapy/spiders/Chemspider.py b/Fourmi/spiders/Chemspider.py similarity index 100% rename from Scrapy/spiders/Chemspider.py rename to Fourmi/spiders/Chemspider.py diff --git a/Scrapy/spiders/Wikipedia.py b/Fourmi/spiders/Wikipedia.py similarity index 100% rename from Scrapy/spiders/Wikipedia.py rename to Fourmi/spiders/Wikipedia.py 
diff --git a/Scrapy/spiders/__init__.py b/Fourmi/spiders/__init__.py similarity index 100% rename from Scrapy/spiders/__init__.py rename to Fourmi/spiders/__init__.py From 4f5b66fff69b68666d4e114afa2d53382476b985 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 18 Mar 2014 17:28:49 +0100 Subject: [PATCH 3/7] Basic structure to make sure the spiders use an argument --- Fourmi/spiders/Chemspider.py | 7 ++++--- Fourmi/spiders/Wikipedia.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Fourmi/spiders/Chemspider.py b/Fourmi/spiders/Chemspider.py index 3fc74a0..b85b44d 100644 --- a/Fourmi/spiders/Chemspider.py +++ b/Fourmi/spiders/Chemspider.py @@ -3,9 +3,10 @@ from scrapy.spider import Spider class ChemspiderSpider(Spider): name = "Chemspider" allowed_domains = ["chemspider.com"] - start_urls = ( - 'http://www.chemspider.com/', - ) + + def __init__(self, compound=None, *args, **kwargs): + super(ChemspiderSpider, self).__init__(*args, **kwargs) + self.start_urls = ["http://chemspiderapiurl/something/%s" % compound] #[TODO] - Give a logical start URL. def parse(self, response): pass diff --git a/Fourmi/spiders/Wikipedia.py b/Fourmi/spiders/Wikipedia.py index 03b202b..62ed026 100644 --- a/Fourmi/spiders/Wikipedia.py +++ b/Fourmi/spiders/Wikipedia.py @@ -3,9 +3,10 @@ from scrapy.spider import Spider class WikipediaSpider(Spider): name = "Wikipedia" allowed_domains = ["wikipedia.org"] - start_urls = ( - 'http://www.wikipedia.org/', - ) + + def __init__(self, compound=None, *args, **kwargs): + super(WikipediaSpider, self).__init__(*args, **kwargs) + self.start_urls = ["http://wikipediaurl/something/%s" % compound] #[TODO] - Give a logical start URL. def parse(self, response): pass From b1840d3a658918281e3bfcd831c375efcb625841 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Tue, 18 Mar 2014 17:41:40 +0100 Subject: [PATCH 4/7] Another name change to accommodate an executable script --- {Fourmi => FourmiCrawler}/__init__.py | 0 {Fourmi => FourmiCrawler}/items.py | 0 {Fourmi => FourmiCrawler}/pipelines.py | 0 {Fourmi => FourmiCrawler}/settings.py | 10 +++++----- {Fourmi => FourmiCrawler}/spiders/Chemspider.py | 0 {Fourmi => FourmiCrawler}/spiders/Wikipedia.py | 0 {Fourmi => FourmiCrawler}/spiders/__init__.py | 0 scrapy.cfg | 2 +- 8 files changed, 6 insertions(+), 6 deletions(-) rename {Fourmi => FourmiCrawler}/__init__.py (100%) rename {Fourmi => FourmiCrawler}/items.py (100%) rename {Fourmi => FourmiCrawler}/pipelines.py (100%) rename {Fourmi => FourmiCrawler}/settings.py (60%) rename {Fourmi => FourmiCrawler}/spiders/Chemspider.py (100%) rename {Fourmi => FourmiCrawler}/spiders/Wikipedia.py (100%) rename {Fourmi => FourmiCrawler}/spiders/__init__.py (100%) diff --git a/Fourmi/__init__.py b/FourmiCrawler/__init__.py similarity index 100% rename from Fourmi/__init__.py rename to FourmiCrawler/__init__.py diff --git a/Fourmi/items.py b/FourmiCrawler/items.py similarity index 100% rename from Fourmi/items.py rename to FourmiCrawler/items.py diff --git a/Fourmi/pipelines.py b/FourmiCrawler/pipelines.py similarity index 100% rename from Fourmi/pipelines.py rename to FourmiCrawler/pipelines.py diff --git a/Fourmi/settings.py b/FourmiCrawler/settings.py similarity index 60% rename from Fourmi/settings.py rename to FourmiCrawler/settings.py index a24e6f6..0f5eae8 100644 --- a/Fourmi/settings.py +++ b/FourmiCrawler/settings.py @@ -6,13 +6,13 @@ # http://doc.scrapy.org/en/latest/topics/settings.html # -BOT_NAME = 'Fourmi' +BOT_NAME = 'FourmiCrawler' -SPIDER_MODULES = ['Fourmi.spiders'] -NEWSPIDER_MODULE = 'Fourmi.spiders' +SPIDER_MODULES = ['FourmiCrawler.spiders'] +NEWSPIDER_MODULE = 'FourmiCrawler.spiders' ITEM_PIPELINES = { - 'Fourmi.pipelines.FourmiPipeline': 100 + 'FourmiCrawler.pipelines.FourmiPipeline': 100 } # Crawl 
responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)' +#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)' diff --git a/Fourmi/spiders/Chemspider.py b/FourmiCrawler/spiders/Chemspider.py similarity index 100% rename from Fourmi/spiders/Chemspider.py rename to FourmiCrawler/spiders/Chemspider.py diff --git a/Fourmi/spiders/Wikipedia.py b/FourmiCrawler/spiders/Wikipedia.py similarity index 100% rename from Fourmi/spiders/Wikipedia.py rename to FourmiCrawler/spiders/Wikipedia.py diff --git a/Fourmi/spiders/__init__.py b/FourmiCrawler/spiders/__init__.py similarity index 100% rename from Fourmi/spiders/__init__.py rename to FourmiCrawler/spiders/__init__.py diff --git a/scrapy.cfg b/scrapy.cfg index 6f432fb..2226c7c 100644 --- a/scrapy.cfg +++ b/scrapy.cfg @@ -4,7 +4,7 @@ # http://doc.scrapy.org/en/latest/topics/scrapyd.html [settings] -default = Scrapy.settings +default = FourmiCrawler.settings [deploy] #url = http://localhost:6800/ From 7355de1b20b9444879f743e20f03533ed19f192b Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 18 Mar 2014 18:03:22 +0100 Subject: [PATCH 5/7] Added a simple script to run a spider --- Fourmi.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 Fourmi.py diff --git a/Fourmi.py b/Fourmi.py new file mode 100644 index 0000000..4ed2c95 --- /dev/null +++ b/Fourmi.py @@ -0,0 +1,21 @@ +""" +Fourmi - An internet webcrawler searching for information on chemical compounds. +[todo] - Add some more useful text here. +""" + +from twisted.internet import reactor +from scrapy.crawler import Crawler +from scrapy import log, signals +from FourmiCrawler.spiders.Chemspider import ChemspiderSpider # [review] - There should be an easy way to import all spiders! 
+from scrapy.utils.project import get_project_settings + +# [todo] - Add something to add all spiders, with the right references +spider = ChemspiderSpider(compound = "Aspirin") +settings = get_project_settings() +crawler = Crawler(settings) +crawler.signals.connect(reactor.stop, signal=signals.spider_closed) +crawler.configure() +crawler.crawl(spider) +crawler.start() +log.start() +reactor.run() \ No newline at end of file From 826937e25e366fac55e478740eb6b55b8a990c6e Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 18 Mar 2014 18:05:44 +0100 Subject: [PATCH 6/7] Unix machines should be able to execute this without any problems. --- Fourmi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Fourmi.py b/Fourmi.py index 4ed2c95..16029f9 100644 --- a/Fourmi.py +++ b/Fourmi.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Fourmi - An internet webcrawler searching for information on chemical compounds. [todo] - Add some more useful text here. From 328cb3808c28237eb5f56713fe827a2d0807e166 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Tue, 18 Mar 2014 18:05:44 +0100 Subject: [PATCH 7/7] Unix machines should be able to execute this without any problems. --- Fourmi.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 Fourmi.py diff --git a/Fourmi.py b/Fourmi.py old mode 100644 new mode 100755 index 4ed2c95..16029f9 --- a/Fourmi.py +++ b/Fourmi.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Fourmi - An internet webcrawler searching for information on chemical compounds. [todo] - Add some more useful text here.