From 6182d4104eef9a68c6958d3320656d7c6ff77900 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 16 Mar 2014 22:54:34 +0100
Subject: [PATCH 01/20] Added a result item which the spiders will return.

---
 Scrapy/items.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Scrapy/items.py b/Scrapy/items.py
index 17b9d3d..5fedc36 100644
--- a/Scrapy/items.py
+++ b/Scrapy/items.py
@@ -5,7 +5,9 @@
 
 from scrapy.item import Item, Field
 
-class FourmiItem(Item):
-    # define the fields for your item here like:
-    # name = Field()
-    pass
+class Result(Item):
+    attribute = Field()
+    value = Field()
+    source = Field()
+    reliability = Field()
+    conditions = Field()
\ No newline at end of file

From 35481128388b8537d9505bf59adb790410a8a3a3 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 16 Mar 2014 23:11:30 +0100
Subject: [PATCH 02/20] Removed all .pyc files and added them to the ignore
 list

---
 .gitignore                  |   3 +++
 Scrapy/__init__.pyc         | Bin 143 -> 0 bytes
 Scrapy/settings.pyc         | Bin 251 -> 0 bytes
 Scrapy/spiders/__init__.pyc | Bin 151 -> 0 bytes
 4 files changed, 3 insertions(+)
 delete mode 100644 Scrapy/__init__.pyc
 delete mode 100644 Scrapy/settings.pyc
 delete mode 100644 Scrapy/spiders/__init__.pyc

diff --git a/.gitignore b/.gitignore
index c1549e0..158ef41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 #EDITOR AND IDE SPECIFIC SETTINGFILES
 .idea
 
+#Python Specific ignores
+*.pyc
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db

diff --git a/Scrapy/__init__.pyc b/Scrapy/__init__.pyc
deleted file mode 100644
index f1096fd5de3c353baeeef45805c9e5c90c6fb80c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[base85 binary delta of the deleted 143-byte .pyc omitted]

diff --git a/Scrapy/settings.pyc b/Scrapy/settings.pyc
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

[base85 binary delta of the deleted 251-byte .pyc omitted]

diff --git a/Scrapy/spiders/__init__.pyc b/Scrapy/spiders/__init__.pyc
deleted file mode 100644
index c2fd93959faa39176f3f6aca2ae6dd4d68149d9b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[base85 binary delta of the deleted 151-byte .pyc omitted]
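A quick sketch of how a spider callback would fill the Result item introduced in PATCH 01; only the field names come from the patch, the compound data below is invented for illustration:

    from Scrapy.items import Result  # package path as of this patch; renamed in later patches

    result = Result()
    result['attribute'] = 'boiling point'       # invented example data
    result['value'] = '-161.5 degrees Celsius'  # invented example data
    result['source'] = 'ExampleSource'
    result['reliability'] = 'unconfirmed'
    result['conditions'] = 'at 1 atm'
    # Scrapy items behave like dicts but reject undeclared fields:
    # assigning result['colour'] would raise a KeyError.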
From 8dd2c168d2d915f8487dc1176198845523bf2a01 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 16 Mar 2014 23:14:59 +0100
Subject: [PATCH 03/20] Added the basic structure for the first two spiders

---
 Scrapy/spiders/Chemspider.py | 11 +++++++++++
 Scrapy/spiders/Wikipedia.py  | 11 +++++++++++
 2 files changed, 22 insertions(+)
 create mode 100644 Scrapy/spiders/Chemspider.py
 create mode 100644 Scrapy/spiders/Wikipedia.py

diff --git a/Scrapy/spiders/Chemspider.py b/Scrapy/spiders/Chemspider.py
new file mode 100644
index 0000000..3fc74a0
--- /dev/null
+++ b/Scrapy/spiders/Chemspider.py
@@ -0,0 +1,11 @@
+from scrapy.spider import Spider
+
+class ChemspiderSpider(Spider):
+    name = "Chemspider"
+    allowed_domains = ["chemspider.com"]
+    start_urls = (
+        'http://www.chemspider.com/',
+    )
+
+    def parse(self, response):
+        pass

diff --git a/Scrapy/spiders/Wikipedia.py b/Scrapy/spiders/Wikipedia.py
new file mode 100644
index 0000000..03b202b
--- /dev/null
+++ b/Scrapy/spiders/Wikipedia.py
@@ -0,0 +1,11 @@
+from scrapy.spider import Spider
+
+class WikipediaSpider(Spider):
+    name = "Wikipedia"
+    allowed_domains = ["wikipedia.org"]
+    start_urls = (
+        'http://www.wikipedia.org/',
+    )
+
+    def parse(self, response):
+        pass

From 55843d320c54b7c7a39c398170d657d07ee80c71 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Mon, 17 Mar 2014 16:25:48 +0100
Subject: [PATCH 04/20] Added a formal pipeline to make sure that we don't
 supply duplicate values.

---
 Scrapy/pipelines.py | 19 ++++++++++++++++++-
 Scrapy/settings.py  |  3 +++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/Scrapy/pipelines.py b/Scrapy/pipelines.py
index 3345787..3194d7e 100644
--- a/Scrapy/pipelines.py
+++ b/Scrapy/pipelines.py
@@ -2,7 +2,24 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+from scrapy.exceptions import DropItem
+
 
 class FourmiPipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
     def process_item(self, item, spider):
-        return item
+        """
+        Processing the items so exact duplicates are dropped
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: :raise DropItem: Returns the item if it is unique, raises DropItem if it is already known
+        """
+        value = item['attribute'], item['value']
+        if value in self.known_values:
+            raise DropItem("Duplicate item found: %s" % item)
+        else:
+            self.known_values.add(value)
+            return item

diff --git a/Scrapy/settings.py b/Scrapy/settings.py
index e43aa2b..fd379a9 100644
--- a/Scrapy/settings.py
+++ b/Scrapy/settings.py
@@ -10,6 +10,9 @@ BOT_NAME = 'Fourmi'
 
 SPIDER_MODULES = ['Scrapy.spiders']
 NEWSPIDER_MODULE = 'Scrapy.spiders'
+ITEM_PIPELINES = {
+    'Scrapy.pipelines.FourmiPipeline': 100
+}
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
 #USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
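The pipeline in PATCH 04 keys its duplicate check on the (attribute, value) pair alone, so the same pair reported by two sources yields a single item; differing source or conditions fields do not make an item unique. A minimal sketch of the behaviour (item data invented):

    from scrapy.exceptions import DropItem
    from Scrapy.items import Result
    from Scrapy.pipelines import FourmiPipeline

    pipeline = FourmiPipeline()
    item = Result(attribute='density', value='0.656 g/L')  # invented data
    pipeline.process_item(item, spider=None)               # unique: returned
    try:
        pipeline.process_item(Result(attribute='density', value='0.656 g/L'), spider=None)
    except DropItem:
        pass                                               # exact double: dropped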
From 2cb21c6b810624cf5569f95828b326fc1ccc1996 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Mon, 17 Mar 2014 16:38:13 +0100
Subject: [PATCH 05/20] Moved the Scrapy code to another namespace; should
 prevent some import faults.

---
 {Scrapy => Fourmi}/__init__.py           | 0
 {Scrapy => Fourmi}/items.py              | 0
 {Scrapy => Fourmi}/pipelines.py          | 0
 {Scrapy => Fourmi}/settings.py           | 6 +++---
 {Scrapy => Fourmi}/spiders/Chemspider.py | 0
 {Scrapy => Fourmi}/spiders/Wikipedia.py  | 0
 {Scrapy => Fourmi}/spiders/__init__.py   | 0
 7 files changed, 3 insertions(+), 3 deletions(-)
 rename {Scrapy => Fourmi}/__init__.py (100%)
 rename {Scrapy => Fourmi}/items.py (100%)
 rename {Scrapy => Fourmi}/pipelines.py (100%)
 rename {Scrapy => Fourmi}/settings.py (77%)
 rename {Scrapy => Fourmi}/spiders/Chemspider.py (100%)
 rename {Scrapy => Fourmi}/spiders/Wikipedia.py (100%)
 rename {Scrapy => Fourmi}/spiders/__init__.py (100%)

diff --git a/Scrapy/__init__.py b/Fourmi/__init__.py
similarity index 100%
rename from Scrapy/__init__.py
rename to Fourmi/__init__.py
diff --git a/Scrapy/items.py b/Fourmi/items.py
similarity index 100%
rename from Scrapy/items.py
rename to Fourmi/items.py
diff --git a/Scrapy/pipelines.py b/Fourmi/pipelines.py
similarity index 100%
rename from Scrapy/pipelines.py
rename to Fourmi/pipelines.py
diff --git a/Scrapy/settings.py b/Fourmi/settings.py
similarity index 77%
rename from Scrapy/settings.py
rename to Fourmi/settings.py
index fd379a9..a24e6f6 100644
--- a/Scrapy/settings.py
+++ b/Fourmi/settings.py
@@ -8,10 +8,10 @@
 
 BOT_NAME = 'Fourmi'
 
-SPIDER_MODULES = ['Scrapy.spiders']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+SPIDER_MODULES = ['Fourmi.spiders']
+NEWSPIDER_MODULE = 'Fourmi.spiders'
 ITEM_PIPELINES = {
-    'Scrapy.pipelines.FourmiPipeline': 100
+    'Fourmi.pipelines.FourmiPipeline': 100
 }
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
diff --git a/Scrapy/spiders/Chemspider.py b/Fourmi/spiders/Chemspider.py
similarity index 100%
rename from Scrapy/spiders/Chemspider.py
rename to Fourmi/spiders/Chemspider.py
diff --git a/Scrapy/spiders/Wikipedia.py b/Fourmi/spiders/Wikipedia.py
similarity index 100%
rename from Scrapy/spiders/Wikipedia.py
rename to Fourmi/spiders/Wikipedia.py
diff --git a/Scrapy/spiders/__init__.py b/Fourmi/spiders/__init__.py
similarity index 100%
rename from Scrapy/spiders/__init__.py
rename to Fourmi/spiders/__init__.py
From 4f5b66fff69b68666d4e114afa2d53382476b985 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 18 Mar 2014 17:28:49 +0100
Subject: [PATCH 06/20] Basic structure to make sure the spiders use an
 argument

---
 Fourmi/spiders/Chemspider.py | 7 ++++---
 Fourmi/spiders/Wikipedia.py  | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Fourmi/spiders/Chemspider.py b/Fourmi/spiders/Chemspider.py
index 3fc74a0..b85b44d 100644
--- a/Fourmi/spiders/Chemspider.py
+++ b/Fourmi/spiders/Chemspider.py
@@ -3,9 +3,10 @@ from scrapy.spider import Spider
 class ChemspiderSpider(Spider):
     name = "Chemspider"
     allowed_domains = ["chemspider.com"]
-    start_urls = (
-        'http://www.chemspider.com/',
-    )
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(ChemspiderSpider, self).__init__(*args, **kwargs)
+        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give a logical start url.
 
     def parse(self, response):
         pass

diff --git a/Fourmi/spiders/Wikipedia.py b/Fourmi/spiders/Wikipedia.py
index 03b202b..62ed026 100644
--- a/Fourmi/spiders/Wikipedia.py
+++ b/Fourmi/spiders/Wikipedia.py
@@ -3,9 +3,10 @@ from scrapy.spider import Spider
 class WikipediaSpider(Spider):
     name = "Wikipedia"
     allowed_domains = ["wikipedia.org"]
-    start_urls = (
-        'http://www.wikipedia.org/',
-    )
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(WikipediaSpider, self).__init__(*args, **kwargs)
+        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give a logical start url.
 
     def parse(self, response):
         pass

From b1840d3a658918281e3bfcd831c375efcb625841 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 18 Mar 2014 17:41:40 +0100
Subject: [PATCH 07/20] Another name change to accommodate an executable
 script

---
 {Fourmi => FourmiCrawler}/__init__.py           |  0
 {Fourmi => FourmiCrawler}/items.py              |  0
 {Fourmi => FourmiCrawler}/pipelines.py          |  0
 {Fourmi => FourmiCrawler}/settings.py           | 10 +++++-----
 {Fourmi => FourmiCrawler}/spiders/Chemspider.py |  0
 {Fourmi => FourmiCrawler}/spiders/Wikipedia.py  |  0
 {Fourmi => FourmiCrawler}/spiders/__init__.py   |  0
 scrapy.cfg                                      |  2 +-
 8 files changed, 6 insertions(+), 6 deletions(-)
 rename {Fourmi => FourmiCrawler}/__init__.py (100%)
 rename {Fourmi => FourmiCrawler}/items.py (100%)
 rename {Fourmi => FourmiCrawler}/pipelines.py (100%)
 rename {Fourmi => FourmiCrawler}/settings.py (60%)
 rename {Fourmi => FourmiCrawler}/spiders/Chemspider.py (100%)
 rename {Fourmi => FourmiCrawler}/spiders/Wikipedia.py (100%)
 rename {Fourmi => FourmiCrawler}/spiders/__init__.py (100%)

diff --git a/Fourmi/__init__.py b/FourmiCrawler/__init__.py
similarity index 100%
rename from Fourmi/__init__.py
rename to FourmiCrawler/__init__.py
diff --git a/Fourmi/items.py b/FourmiCrawler/items.py
similarity index 100%
rename from Fourmi/items.py
rename to FourmiCrawler/items.py
diff --git a/Fourmi/pipelines.py b/FourmiCrawler/pipelines.py
similarity index 100%
rename from Fourmi/pipelines.py
rename to FourmiCrawler/pipelines.py
diff --git a/Fourmi/settings.py b/FourmiCrawler/settings.py
similarity index 60%
rename from Fourmi/settings.py
rename to FourmiCrawler/settings.py
index a24e6f6..0f5eae8 100644
--- a/Fourmi/settings.py
+++ b/FourmiCrawler/settings.py
@@ -6,13 +6,13 @@
 # http://doc.scrapy.org/en/latest/topics/settings.html
 #
 
-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['Fourmi.spiders']
-NEWSPIDER_MODULE = 'Fourmi.spiders'
+SPIDER_MODULES = ['FourmiCrawler.spiders']
+NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
 ITEM_PIPELINES = {
-    'Fourmi.pipelines.FourmiPipeline': 100
+    'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
diff --git a/Fourmi/spiders/Chemspider.py b/FourmiCrawler/spiders/Chemspider.py
similarity index 100%
rename from Fourmi/spiders/Chemspider.py
rename to FourmiCrawler/spiders/Chemspider.py
diff --git a/Fourmi/spiders/Wikipedia.py b/FourmiCrawler/spiders/Wikipedia.py
similarity index 100%
rename from Fourmi/spiders/Wikipedia.py
rename to FourmiCrawler/spiders/Wikipedia.py
diff --git a/Fourmi/spiders/__init__.py b/FourmiCrawler/spiders/__init__.py
similarity index 100%
rename from Fourmi/spiders/__init__.py
rename to FourmiCrawler/spiders/__init__.py
diff --git a/scrapy.cfg b/scrapy.cfg
index 6f432fb..2226c7c 100644
--- a/scrapy.cfg
+++ b/scrapy.cfg
@@ -4,7 +4,7 @@
 # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 
 [settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings
 
 [deploy]
 #url = http://localhost:6800/
From 7355de1b20b9444879f743e20f03533ed19f192b Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 18 Mar 2014 18:03:22 +0100
Subject: [PATCH 08/20] Added a simple script to run a spider

---
 Fourmi.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 Fourmi.py

diff --git a/Fourmi.py b/Fourmi.py
new file mode 100644
index 0000000..4ed2c95
--- /dev/null
+++ b/Fourmi.py
@@ -0,0 +1,21 @@
+"""
+Fourmi - An internet webcrawler searching for information on chemical compounds.
+[todo] - Add some more useful text here.
+"""
+
+from twisted.internet import reactor
+from scrapy.crawler import Crawler
+from scrapy import log, signals
+from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
+from scrapy.utils.project import get_project_settings
+
+# [todo] - Add something to add all spiders, with the right references
+spider = ChemspiderSpider(compound = "Aspirin")
+settings = get_project_settings()
+crawler = Crawler(settings)
+crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+crawler.configure()
+crawler.crawl(spider)
+crawler.start()
+log.start()
+reactor.run()
\ No newline at end of file

From 826937e25e366fac55e478740eb6b55b8a990c6e Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 18 Mar 2014 18:05:44 +0100
Subject: [PATCH 09/20] Unix machines should be able to execute this without
 any problems.

---
 Fourmi.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Fourmi.py b/Fourmi.py
index 4ed2c95..16029f9 100644
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Fourmi - An internet webcrawler searching for information on chemical compounds.
 [todo] - Add some more useful text here.

From 328cb3808c28237eb5f56713fe827a2d0807e166 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Tue, 18 Mar 2014 18:05:44 +0100
Subject: [PATCH 10/20] Unix machines should be able to execute this without
 any problems.

---
 Fourmi.py | 1 +
 1 file changed, 1 insertion(+)
 mode change 100644 => 100755 Fourmi.py

diff --git a/Fourmi.py b/Fourmi.py
old mode 100644
new mode 100755
index 4ed2c95..16029f9
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Fourmi - An internet webcrawler searching for information on chemical compounds.
 [todo] - Add some more useful text here.

From 306a37db1a9be535a0d624b8bf5e1004f218f43c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sat, 22 Mar 2014 15:48:08 +0100
Subject: [PATCH 11/20] A better structure which is able to start multiple
 spiders.

---
 Fourmi.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index 16029f9..640f9f7 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -10,13 +10,20 @@ from scrapy import log, signals
 from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
 from scrapy.utils.project import get_project_settings
 
-# [todo] - Add something to add all spiders, with the right references
-spider = ChemspiderSpider(compound = "Aspirin")
-settings = get_project_settings()
-crawler = Crawler(settings)
-crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-crawler.configure()
-crawler.crawl(spider)
-crawler.start()
-log.start()
-reactor.run()
\ No newline at end of file
+defined_spiders = [ChemspiderSpider(compound = "Methane")]
+
+def setup_crawler(Spider, compound):
+	spider = FollowAllSpider(domain=domain)  # [todo] - Do something smart to get the different spiders to work here.
+	settings = get_project_settings()
+	crawler = Crawler(settings)
+	crawler.configure()
+	crawler.crawl(spider)
+	crawler.start()
+
+def start():
+	for spider in defined_spiders:
+		setup_crawler(spider, compound)
+	log.start()
+	reactor.run()
+
+start()
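PATCH 11 introduces the multi-spider loop but still instantiates a placeholder FollowAllSpider and drops the spider_closed/reactor.stop hookup from PATCH 08. Under this pre-1.0 Scrapy API every spider needs its own Crawler, and the Twisted reactor is started once after all of them. A corrected sketch of the intended loop (the spider list and compound are example inputs, not code from the patches):

    from twisted.internet import reactor
    from scrapy.crawler import Crawler
    from scrapy import log
    from scrapy.utils.project import get_project_settings

    def setup_crawler(spider):
        # one Crawler per spider; all of them share the running reactor
        crawler = Crawler(get_project_settings())
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()

    def start(spider_classes, compound):
        for spider_class in spider_classes:  # e.g. [ChemspiderSpider, WikipediaSpider]
            setup_crawler(spider_class(compound=compound))
        log.start()
        reactor.run()  # blocks until something calls reactor.stop()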
From 8175e02f6c54e7b92f35f794a52655963e077e3a Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Thu, 27 Mar 2014 13:08:46 +0100
Subject: [PATCH 12/20] New structure, splitting on parsers instead of spiders

---
 Fourmi.py                             | 11 ++++-------
 FourmiCrawler/spiders/Chemspider.py   | 12 ------------
 FourmiCrawler/spiders/Fourmispider.py | 12 ++++++++++++
 FourmiCrawler/spiders/Wikipedia.py    | 12 ------------
 4 files changed, 16 insertions(+), 31 deletions(-)
 delete mode 100644 FourmiCrawler/spiders/Chemspider.py
 create mode 100644 FourmiCrawler/spiders/Fourmispider.py
 delete mode 100644 FourmiCrawler/spiders/Wikipedia.py

diff --git a/Fourmi.py b/Fourmi.py
index 640f9f7..a0a9ead 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -7,13 +7,11 @@ Fourmi - An internet webcrawler searching for information on chemical compounds.
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-defined_spiders = [ChemspiderSpider(compound = "Methane")]
-
-def setup_crawler(Spider, compound):
-	spider = FollowAllSpider(domain=domain)  # [todo] - Do something smart to get the different spiders to work here.
+def setup_crawler(compound):
+	spider = FourmiSpider(domain=domain)  # [todo] - Do something smart to get the different spiders to work here.
 	settings = get_project_settings()
 	crawler = Crawler(settings)
 	crawler.configure()
@@ -21,8 +19,7 @@ def setup_crawler(compound):
 	crawler.start()
 
 def start():
-	for spider in defined_spiders:
-		setup_crawler(spider, compound)
+	setup_crawler(compound)
 	log.start()
 	reactor.run()

diff --git a/FourmiCrawler/spiders/Chemspider.py b/FourmiCrawler/spiders/Chemspider.py
deleted file mode 100644
index b85b44d..0000000
--- a/FourmiCrawler/spiders/Chemspider.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give a logical start url.
-
-    def parse(self, response):
-        pass

diff --git a/FourmiCrawler/spiders/Fourmispider.py b/FourmiCrawler/spiders/Fourmispider.py
new file mode 100644
index 0000000..f7b64bd
--- /dev/null
+++ b/FourmiCrawler/spiders/Fourmispider.py
@@ -0,0 +1,12 @@
+from scrapy.spider import Spider
+
+class FourmiSpider(Spider):
+	name="FourmiSpider"
+
+	def __init__(self, compound=None, *args, **kwargs):
+		super(FourmiSpider, self).__init__(*args, **kwargs)
+		# [TODO] - Initiate all parsers for the different websites and get allowed URLs.
+
+	def parse(self, response):
+		# [TODO] - This function should delegate its functionality to other parsers.
+		pass

diff --git a/FourmiCrawler/spiders/Wikipedia.py b/FourmiCrawler/spiders/Wikipedia.py
deleted file mode 100644
index 62ed026..0000000
--- a/FourmiCrawler/spiders/Wikipedia.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give a logical start url.
-
-    def parse(self, response):
-        pass
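PATCH 12 replaces the per-site spiders with a single FourmiSpider whose parse method is meant to delegate to per-site parsers. One plausible shape for that delegation, matching a parser's website pattern against the responding URL (the regex matching and the parsers list are assumptions; the later parser patches only fix the website attribute and the parse signature):

    import re
    from scrapy.spider import Spider

    class FourmiSpider(Spider):
        name = "FourmiSpider"

        def __init__(self, compound=None, *args, **kwargs):
            super(FourmiSpider, self).__init__(*args, **kwargs)
            self.compound = compound
            self.parsers = []  # filled by add_parser, see PATCH 18

        def parse(self, response):
            # hand the response to the first parser whose website
            # pattern matches the URL that produced it
            for parser in self.parsers:
                if re.match(parser.website, response.url):
                    return parser.parse(response)
            return None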
From bdcf359da7c5fdda98dade75e5de908edb4d1f32 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Thu, 27 Mar 2014 13:12:27 +0100
Subject: [PATCH 13/20] Logical fixes to have some "working" case

---
 Fourmi.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index a0a9ead..9bdec24 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -10,8 +10,8 @@ from scrapy import log, signals
 from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-def setup_crawler(compound):
-	spider = FourmiSpider(domain=domain)  # [todo] - Do something smart to get the different spiders to work here.
+def setup_crawler(searchable):
+	spider = FourmiSpider(compound=searchable)  # [todo] - Do something smart to get the different spiders to work here.
 	settings = get_project_settings()
 	crawler = Crawler(settings)
 	crawler.configure()
@@ -19,7 +19,7 @@ def setup_crawler(searchable):
 	crawler.crawl(spider)
 	crawler.start()
 
 def start():
-	setup_crawler(compound)
+	setup_crawler("Methane")
 	log.start()
 	reactor.run()
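Because the compound is now a plain constructor keyword (PATCH 06, fixed up here), the spider can also be driven by the stock Scrapy command line, which forwards -a key=value pairs to the spider constructor:

    scrapy crawl FourmiSpider -a compound=Methane

This is the documented Scrapy mechanism for spider arguments and needs no custom runner script.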
From 8e9314e753c2390485c96a56364d69dbc0e4f80c Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Thu, 27 Mar 2014 13:18:55 +0100
Subject: [PATCH 14/20] One spider should have its own folder

---
 FourmiCrawler/settings.py                            | 4 ++--
 FourmiCrawler/{spiders/Fourmispider.py => spider.py} | 0
 FourmiCrawler/spiders/__init__.py                    | 4 ----
 3 files changed, 2 insertions(+), 6 deletions(-)
 rename FourmiCrawler/{spiders/Fourmispider.py => spider.py} (100%)
 delete mode 100644 FourmiCrawler/spiders/__init__.py

diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 0f5eae8..28272d0 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -8,8 +8,8 @@
 
 BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['FourmiCrawler.spiders']
-NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
diff --git a/FourmiCrawler/spiders/Fourmispider.py b/FourmiCrawler/spider.py
similarity index 100%
rename from FourmiCrawler/spiders/Fourmispider.py
rename to FourmiCrawler/spider.py
diff --git a/FourmiCrawler/spiders/__init__.py b/FourmiCrawler/spiders/__init__.py
deleted file mode 100644
index ebd689a..0000000
--- a/FourmiCrawler/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.

From 5b17627504672dc90f0f744d942d2c5c9a055d78 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Thu, 27 Mar 2014 13:23:03 +0100
Subject: [PATCH 15/20] The parsers however could use their own folder

---
 FourmiCrawler/parsers/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 FourmiCrawler/parsers/__init__.py

diff --git a/FourmiCrawler/parsers/__init__.py b/FourmiCrawler/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
From 87d10415177ceae3485956bc13450891d3f51182 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Fri, 28 Mar 2014 14:11:36 +0100
Subject: [PATCH 16/20] Made all Python files PEP-8 compliant

---
 Fourmi.py                 | 24 +++++++++++++-----------
 FourmiCrawler/items.py    |  3 ++-
 FourmiCrawler/settings.py |  6 ++++--
 FourmiCrawler/spider.py   | 21 ++++++++++++---------
 4 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index 9bdec24..96c808e 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical compounds.
-[todo] - Add some more useful text here.
+Fourmi - An internet webcrawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
 """
 
 from twisted.internet import reactor
@@ -10,17 +10,19 @@ from scrapy import log, signals
 from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
+
 def setup_crawler(searchable):
-	spider = FourmiSpider(compound=searchable)  # [todo] - Do something smart to get the different spiders to work here.
-	settings = get_project_settings()
-	crawler = Crawler(settings)
-	crawler.configure()
-	crawler.crawl(spider)
-	crawler.start()
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
 
 def start():
-	setup_crawler("Methane")
-	log.start()
-	reactor.run()
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
 
 start()
diff --git a/FourmiCrawler/items.py b/FourmiCrawler/items.py
index 5fedc36..c7fd41c 100644
--- a/FourmiCrawler/items.py
+++ b/FourmiCrawler/items.py
@@ -5,9 +5,10 @@
 
 from scrapy.item import Item, Field
 
+
 class Result(Item):
     attribute = Field()
     value = Field()
     source = Field()
     reliability = Field()
-    conditions = Field()
\ No newline at end of file
+    conditions = Field()
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index 28272d0..b025167 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -14,5 +14,7 @@ ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
 
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index f7b64bd..2805c8e 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -1,12 +1,15 @@
 from scrapy.spider import Spider
 
-class FourmiSpider(Spider):
-	name="FourmiSpider"
-
-	def __init__(self, compound=None, *args, **kwargs):
-		super(FourmiSpider, self).__init__(*args, **kwargs)
-		# [TODO] - Initiate all parsers for the different websites and get allowed URLs.
-
-	def parse(self, response):
-		# [TODO] - This function should delegate its functionality to other parsers.
-		pass
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        # [TODO] - Initiate all parsers for the different websites and get
+        # allowed URLs.
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other
+        # parsers.
+        pass

From d91706d6e52892ac1b8bccbe74792bdcf9255ffe Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Fri, 28 Mar 2014 14:14:39 +0100
Subject: [PATCH 17/20] The script should stop at some point; added a stopping
 signal

---
 Fourmi.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Fourmi.py b/Fourmi.py
index 96c808e..533240e 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -15,6 +15,7 @@ def setup_crawler(searchable):
     spider = FourmiSpider(compound=searchable)
     settings = get_project_settings()
     crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
     crawler.crawl(spider)
     crawler.start()
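Wiring reactor.stop straight to spider_closed (PATCH 17) stops the process as soon as the first spider closes, which is correct while there is exactly one spider; if several crawlers ever share the reactor again, as in PATCH 11, the first one to finish would kill the rest. A common workaround, sketched here and not part of these patches, is to count closures before stopping:

    from twisted.internet import reactor
    from scrapy import signals

    class SpiderCounter(object):
        def __init__(self, expected):
            self.remaining = expected  # number of spiders that were started

        def spider_closed(self):
            self.remaining -= 1
            if self.remaining == 0:
                reactor.stop()  # stop only after the last spider closes

    # for every crawler that is set up:
    #   crawler.signals.connect(counter.spider_closed, signal=signals.spider_closed)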
From 325febe834feaff06f9ceab4462fab17720902ce Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Fri, 28 Mar 2014 14:43:22 +0100
Subject: [PATCH 18/20] Added a basic parser class to extend; next step is
 implementing the global function

---
 Fourmi.py               | 2 ++
 FourmiCrawler/spider.py | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Fourmi.py b/Fourmi.py
index 533240e..f1bf1ba 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -12,6 +12,8 @@ from scrapy.utils.project import get_project_settings
 
 
 def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
     spider = FourmiSpider(compound=searchable)
     settings = get_project_settings()
     crawler = Crawler(settings)
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 2805c8e..4c25df9 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -6,10 +6,11 @@ class FourmiSpider(Spider):
 
     def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        # [TODO] - Initiate all parsers for the different websites and get
-        # allowed URLs.
 
     def parse(self, response):
         # [TODO] - This function should delegate its functionality to other
         # parsers.
         pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)

From 32cedecf2e3ebab0c965457d003a5293f1115d91 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Fri, 28 Mar 2014 14:43:22 +0100
Subject: [PATCH 19/20] Added a basic parser class to extend; next step is
 implementing the global function

---
 Fourmi.py                       | 2 ++
 FourmiCrawler/parsers/parser.py | 9 +++++++++
 FourmiCrawler/spider.py         | 5 +++--
 3 files changed, 14 insertions(+), 2 deletions(-)
 create mode 100644 FourmiCrawler/parsers/parser.py

diff --git a/Fourmi.py b/Fourmi.py
index 533240e..f1bf1ba 100755
--- a/Fourmi.py
+++ b/Fourmi.py
@@ -12,6 +12,8 @@ from scrapy.utils.project import get_project_settings
 
 
 def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
     spider = FourmiSpider(compound=searchable)
     settings = get_project_settings()
     crawler = Crawler(settings)
diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py
new file mode 100644
index 0000000..3362d59
--- /dev/null
+++ b/FourmiCrawler/parsers/parser.py
@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    website = "http://localhost/*"
+
+    def parse(self, response):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
index 2805c8e..4c25df9 100644
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@@ -6,10 +6,11 @@ class FourmiSpider(Spider):
 
     def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        # [TODO] - Initiate all parsers for the different websites and get
-        # allowed URLs.
 
     def parse(self, response):
         # [TODO] - This function should delegate its functionality to other
         # parsers.
         pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)

From e210ce85588af22f4408f54776c313b8130f8dc8 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 30 Mar 2014 22:08:21 +0200
Subject: [PATCH 20/20] Merge branch 'develop', remote-tracking branch
 'origin/develop' into develop
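The empty Parser class from PATCH 19 fixes the intended contract: a website pattern plus a parse method that produces Result items. A hypothetical subclass, to show how a site-specific parser would slot into this structure (the URL pattern and the extracted values are invented):

    from FourmiCrawler.items import Result
    from FourmiCrawler.parsers.parser import Parser


    class ExampleParser(Parser):
        website = "http://example.com/*"  # hypothetical source site

        def parse(self, response):
            # a real parser would run selectors against the site's markup;
            # the values below are placeholders
            result = Result()
            result['attribute'] = 'melting point'
            result['value'] = 'example value'
            result['source'] = 'example.com'
            result['reliability'] = 'unknown'
            result['conditions'] = ''
            return [result]

A FourmiSpider would then register an instance via add_parser from PATCH 18; note that the patches never initialise the parsers collection that add_parser writes to, so a follow-up commit still has to add it.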