diff --git a/.gitignore b/.gitignore
index c1549e0..158ef41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 #EDITOR AND IDE SPECIFIC SETTINGFILES
 .idea
 
+#Python Specific ignores
+*.pyc
+
 #THINGS WE WOULD NEVER EVER WANT!
 #ignore thumbnails created by windows
 Thumbs.db
diff --git a/Fourmi.py b/Fourmi.py
new file mode 100755
index 0000000..f1bf1ba
--- /dev/null
+++ b/Fourmi.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+"""
+Fourmi - A web crawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
+"""
+
+from twisted.internet import reactor
+from scrapy.crawler import Crawler
+from scrapy import log, signals
+from FourmiCrawler.spider import FourmiSpider
+from scrapy.utils.project import get_project_settings
+
+
+def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def start():
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
+
+start()
diff --git a/Scrapy/__init__.py b/FourmiCrawler/__init__.py
similarity index 100%
rename from Scrapy/__init__.py
rename to FourmiCrawler/__init__.py
diff --git a/Scrapy/items.py b/FourmiCrawler/items.py
similarity index 54%
rename from Scrapy/items.py
rename to FourmiCrawler/items.py
index 17b9d3d..c7fd41c 100644
--- a/Scrapy/items.py
+++ b/FourmiCrawler/items.py
@@ -5,7 +5,10 @@
 from scrapy.item import Item, Field
 
 
-class FourmiItem(Item):
-    # define the fields for your item here like:
-    # name = Field()
-    pass
+
+class Result(Item):
+    attribute = Field()
+    value = Field()
+    source = Field()
+    reliability = Field()
+    conditions = Field()
diff --git a/FourmiCrawler/parsers/__init__.py b/FourmiCrawler/parsers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/FourmiCrawler/parsers/parser.py b/FourmiCrawler/parsers/parser.py
new file mode 100644
index 0000000..3362d59
--- /dev/null
+++ b/FourmiCrawler/parsers/parser.py
@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    website = "http://localhost/*"
+
+    def parse(self, response):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass
diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
new file mode 100644
index 0000000..3194d7e
--- /dev/null
+++ b/FourmiCrawler/pipelines.py
@@ -0,0 +1,26 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+from scrapy.exceptions import DropItem
+
+
+class FourmiPipeline(object):
+
+    def __init__(self):
+        self.known_values = set()
+
+    def process_item(self, item, spider):
+        """
+        Process incoming items, dropping exact duplicates
+        :param item: The incoming item
+        :param spider: The spider which scraped the item
+        :return: the item, if it is unique
+        :raise DropItem: if an identical (attribute, value) pair was already seen
+        """
+        value = item['attribute'], item['value']
+        if value in self.known_values:
+            raise DropItem("Duplicate item found: %s" % item)
+        else:
+            self.known_values.add(value)
+        return item
diff --git a/Scrapy/settings.py b/FourmiCrawler/settings.py
similarity index 54%
rename from Scrapy/settings.py
rename to FourmiCrawler/settings.py
index e43aa2b..b025167 100644
--- a/Scrapy/settings.py
+++ b/FourmiCrawler/settings.py
@@ -6,10 +6,15 @@
 # http://doc.scrapy.org/en/latest/topics/settings.html
 #
 
-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['Scrapy.spiders']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
+ITEM_PIPELINES = {
+    'FourmiCrawler.pipelines.FourmiPipeline': 100
+}
 
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py
new file mode 100644
index 0000000..4c25df9
--- /dev/null
+++ b/FourmiCrawler/spider.py
@@ -0,0 +1,18 @@
+from scrapy.spider import Spider
+
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        self.compound = compound
+        self.parsers = set()
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other
+        # parsers.
+        pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)
diff --git a/Scrapy/__init__.pyc b/Scrapy/__init__.pyc
deleted file mode 100644
index f1096fd..0000000
Binary files a/Scrapy/__init__.pyc and /dev/null differ
diff --git a/Scrapy/pipelines.py b/Scrapy/pipelines.py
deleted file mode 100644
index 3345787..0000000
--- a/Scrapy/pipelines.py
+++ /dev/null
@@ -1,8 +0,0 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
-class FourmiPipeline(object):
-    def process_item(self, item, spider):
-        return item
diff --git a/Scrapy/settings.pyc b/Scrapy/settings.pyc
deleted file mode 100644
index 828e883..0000000
Binary files a/Scrapy/settings.pyc and /dev/null differ
diff --git a/Scrapy/spiders/__init__.py b/Scrapy/spiders/__init__.py
deleted file mode 100644
index ebd689a..0000000
--- a/Scrapy/spiders/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.
diff --git a/Scrapy/spiders/__init__.pyc b/Scrapy/spiders/__init__.pyc
deleted file mode 100644
index c2fd939..0000000
Binary files a/Scrapy/spiders/__init__.pyc and /dev/null differ
diff --git a/scrapy.cfg b/scrapy.cfg
index 6f432fb..2226c7c 100644
--- a/scrapy.cfg
+++ b/scrapy.cfg
@@ -4,7 +4,7 @@
 # http://doc.scrapy.org/en/latest/topics/scrapyd.html
 
 [settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings
 
 [deploy]
 #url = http://localhost:6800/
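
The [TODO] in FourmiSpider.parse leaves the delegation mechanism open. Below is a minimal sketch of how the spider might route each response to whichever registered parser claims the matching website. The parsers set, add_parser, and the Parser.website glob come from the diff above; the glob-to-regex translation and the DelegatingSpider name are assumptions for illustration, not part of this change.

import re

from FourmiCrawler.spider import FourmiSpider


class DelegatingSpider(FourmiSpider):
    # Hypothetical subclass; shows one way parse() could delegate.

    def parse(self, response):
        for parser in self.parsers:
            # Turn the "http://localhost/*" style glob on Parser.website
            # into a regular expression and test it against the URL.
            pattern = re.escape(parser.website).replace(r"\*", ".*")
            if re.match(pattern, response.url):
                # Let the matching parser extract and yield Result items.
                return parser.parse(response)
        self.log("No parser matched %s" % response.url)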
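
In the same spirit, the [TODO] in setup_crawler suggests each website-specific parser gets instantiated and attached to the spider before the crawl starts. A sketch under the assumption that concrete parsers subclass Parser and register through add_parser; ChemSpiderParser and setup_spider are invented placeholders, not names from the diff.

from FourmiCrawler.parsers.parser import Parser
from FourmiCrawler.spider import FourmiSpider


class ChemSpiderParser(Parser):
    # Assumed concrete parser; only the empty base Parser exists in the diff.
    website = "http://www.chemspider.com/*"


def setup_spider(searchable):
    spider = FourmiSpider(compound=searchable)
    # Register every known parser; their website globs could later seed
    # the allowed URLs mentioned in the [TODO].
    for parser_class in [ChemSpiderParser]:
        spider.add_parser(parser_class())
    return spider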