
Merge branch 'feature/basic-structure' into develop

Commit 6bbee865c4 by Jip J. Dekker, 2014-03-28 14:46:43 +01:00
9 changed files with 55 additions and 46 deletions


@@ -1,22 +1,31 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical compounds.
-[todo] - Add some more useful text here.
+Fourmi - An internet webcrawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
 """
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-# [todo] - Add something to add all spiders, with the right references
-spider = ChemspiderSpider(compound = "Aspirin")
-settings = get_project_settings()
-crawler = Crawler(settings)
-crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-crawler.configure()
-crawler.crawl(spider)
-crawler.start()
-log.start()
-reactor.run()
+
+def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def start():
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
+
+start()
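The [TODO] in setup_crawler points at the missing piece: per-website parsers still have to be created and attached to the spider. A minimal sketch of what that wiring could look like, using only names from this commit plus a hypothetical PARSERS registry:

# Sketch only; PARSERS is a hypothetical registry, not part of this commit.
PARSERS = [Parser]  # Parser is the placeholder base class added further down

def setup_crawler(searchable):
    spider = FourmiSpider(compound=searchable)
    for parser_class in PARSERS:
        spider.add_parser(parser_class())
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()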


@@ -5,9 +5,10 @@
 from scrapy.item import Item, Field
 
 
 class Result(Item):
     attribute = Field()
     value = Field()
     source = Field()
     reliability = Field()
-    conditions = Field()
+    conditions = Field()
+
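Result is an ordinary Scrapy Item, so its fields are filled dict-style. A short usage sketch with made-up values (the import path is assumed from the standard Scrapy project layout):

from FourmiCrawler.items import Result  # assumed module path

result = Result()
result['attribute'] = 'boiling point'  # example values, not real data
result['value'] = '100 degrees Celsius'
result['source'] = 'some source website'
result['reliability'] = 'unverified'
result['conditions'] = 'at 1 atm'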


@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    website = "http://localhost/*"
+
+    def parse(self, response):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass
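Concrete parsers would subclass this placeholder and override parse for their own site. A hypothetical sketch (the class name and URL pattern are assumptions, not part of this commit):

class ChemspiderParser(Parser):
    # Hypothetical subclass for illustration only.
    website = "http://www.chemspider.com/*"

    def parse(self, response):
        # Extract Result items from the ChemSpider response here.
        pass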


@@ -8,11 +8,13 @@
 BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['FourmiCrawler.spiders']
-NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
 
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
 
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
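ITEM_PIPELINES maps each pipeline class to a priority between 0 and 1000; lower numbers run first. The referenced FourmiCrawler.pipelines.FourmiPipeline is not shown in this diff, so here is only a minimal sketch of the interface such a pipeline has to implement, assuming it simply passes items through:

# FourmiCrawler/pipelines.py -- interface sketch only; the real
# FourmiPipeline implementation is not part of this diff.
class FourmiPipeline(object):

    def process_item(self, item, spider):
        # A real pipeline might clean, validate, or de-duplicate the item here.
        return item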

FourmiCrawler/spider.py (new file)

@@ -0,0 +1,17 @@
+from scrapy.spider import Spider
+
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        self.parsers = set()  # was missing; add_parser below needs it
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other
+        # parsers.
+        pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)
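Together with the Parser placeholder from earlier in this diff, the spider composes like this; a usage sketch, not code from this commit:

spider = FourmiSpider(compound="Methane")
spider.add_parser(Parser())  # attach the placeholder parser defined above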


@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give a logical start URL.
-
-    def parse(self, response):
-        pass


@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give a logical start URL.
-
-    def parse(self, response):
-        pass


@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.