
Merge branch 'feature/basic-structure' into develop

commit aa65bbd459
Author: Jip J. Dekker
Date: 2014-03-18 18:10:03 +01:00

12 changed files with 79 additions and 35 deletions

Fourmi.py (new executable file)

@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""
Fourmi - An internet crawler that searches for information on chemical compounds.
[todo] - Add some more useful text here.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!

# [todo] - Add something to start all spiders, with the right references
spider = ChemspiderSpider(compound="Aspirin")
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
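
The [review] and [todo] notes above ask for a way to start every spider without importing each one by hand. A minimal sketch of one approach, using the same pre-1.0 Scrapy API as this commit; the hard-coded class list and the counter-based shutdown are assumptions, not part of the commit:

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from FourmiCrawler.spiders.Chemspider import ChemspiderSpider
from FourmiCrawler.spiders.Wikipedia import WikipediaSpider

SPIDER_CLASSES = [ChemspiderSpider, WikipediaSpider]  # assumption: still listed by hand
open_spiders = [len(SPIDER_CLASSES)]

def _spider_closed():
    # Stop the reactor only once the last spider has closed,
    # instead of on the first spider_closed signal.
    open_spiders[0] -= 1
    if open_spiders[0] == 0:
        reactor.stop()

settings = get_project_settings()
for spider_cls in SPIDER_CLASSES:
    crawler = Crawler(settings)
    crawler.signals.connect(_spider_closed, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider_cls(compound="Aspirin"))
    crawler.start()

log.start()
reactor.run()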

FourmiCrawler/pipelines.py (new file)

@@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process an incoming item, dropping exact duplicates.
        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: the item, if its (attribute, value) pair is not yet known
        :raise DropItem: if the item is an exact duplicate of an earlier one
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
        return item
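
A quick sketch of the intended behaviour, using plain dicts as stand-in items (the attribute and value are hypothetical; anything indexable by 'attribute' and 'value' works here):

from scrapy.exceptions import DropItem
from FourmiCrawler.pipelines import FourmiPipeline

pipeline = FourmiPipeline()
first = {'attribute': 'boiling point', 'value': '373 K'}  # hypothetical item
print(pipeline.process_item(first, spider=None))  # unique: passes through
try:
    pipeline.process_item(dict(first), spider=None)  # exact double of the first
except DropItem as exc:
    print(exc)  # "Duplicate item found: ..."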

FourmiCrawler/settings.py

@@ -6,10 +6,13 @@
# http://doc.scrapy.org/en/latest/topics/settings.html
#
-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'

-SPIDER_MODULES = ['Scrapy.spiders']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+SPIDER_MODULES = ['FourmiCrawler.spiders']
+NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+ITEM_PIPELINES = {
+    'FourmiCrawler.pipelines.FourmiPipeline': 100
+}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
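
For reference, the integer assigned to each entry in ITEM_PIPELINES is an ordering value in the 0-1000 range; items pass through the enabled pipelines in ascending order. A sketch of how a second pipeline would slot in, where AttributeNormalizer is a hypothetical name, not something in this commit:

ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.AttributeNormalizer': 50,  # hypothetical: would run first
    'FourmiCrawler.pipelines.FourmiPipeline': 100,      # the duplicate filter runs after it
}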

FourmiCrawler/spiders/Chemspider.py (new file)

@@ -0,0 +1,12 @@
from scrapy.spider import Spider


class ChemspiderSpider(Spider):
    name = "Chemspider"
    allowed_domains = ["chemspider.com"]

    def __init__(self, compound=None, *args, **kwargs):
        super(ChemspiderSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [todo] - Give a logical start URL.

    def parse(self, response):
        pass
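
Because the compound is taken as a constructor keyword, the spider can also be launched from the Scrapy command line with: scrapy crawl Chemspider -a compound=Aspirin; each -a flag is forwarded to __init__ as a keyword argument. The direct equivalent in code:

from FourmiCrawler.spiders.Chemspider import ChemspiderSpider

spider = ChemspiderSpider(compound="Aspirin")
print(spider.start_urls)  # ["http://chemspiderapiurl/something/Aspirin"] (placeholder URL from the commit)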

FourmiCrawler/spiders/Wikipedia.py (new file)

@@ -0,0 +1,12 @@
from scrapy.spider import Spider


class WikipediaSpider(Spider):
    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]

    def __init__(self, compound=None, *args, **kwargs):
        super(WikipediaSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [todo] - Give a logical start URL.

    def parse(self, response):
        pass

Scrapy/pipelines.py (deleted)

@@ -1,8 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class FourmiPipeline(object):
    def process_item(self, item, spider):
        return item

Scrapy/spiders/Chemspider.py (deleted)

@@ -1,11 +0,0 @@
from scrapy.spider import Spider


class ChemspiderSpider(Spider):
    name = "Chemspider"
    allowed_domains = ["chemspider.com"]
    start_urls = (
        'http://www.chemspider.com/',
    )

    def parse(self, response):
        pass

Scrapy/spiders/Wikipedia.py (deleted)

@@ -1,11 +0,0 @@
from scrapy.spider import Spider


class WikipediaSpider(Spider):
    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]
    start_urls = (
        'http://www.wikipedia.org/',
    )

    def parse(self, response):
        pass

scrapy.cfg

@@ -4,7 +4,7 @@
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings
[deploy]
#url = http://localhost:6800/