
Merge branch 'feature/basic-structure' into develop

Jip J. Dekker 2014-03-28 14:46:43 +01:00
commit 6bbee865c4
9 changed files with 55 additions and 46 deletions


@@ -1,22 +1,31 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical compounds.
-[todo] - Add some more useful text here.
+Fourmi - An internet webcrawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
 """
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
+# [todo] - Add something to add all spiders, with the right references
 
-spider = ChemspiderSpider(compound = "Aspirin")
-settings = get_project_settings()
-crawler = Crawler(settings)
-crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-crawler.configure()
-crawler.crawl(spider)
-crawler.start()
-log.start()
-reactor.run()
+
+def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def start():
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
+
+start()

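The [todo] above asks for a way to register all spiders and parsers with the right references. A minimal sketch of how setup_crawler could attach site parsers to the new FourmiSpider, using the add_parser method this commit introduces in FourmiCrawler/spider.py; the parsers argument and the parser objects passed in are assumptions, not part of this commit:

    def setup_crawler(searchable, parsers):
        # Sketch only: `parsers` is a hypothetical iterable of Parser
        # instances; FourmiSpider.add_parser is defined in this commit.
        spider = FourmiSpider(compound=searchable)
        for parser in parsers:
            spider.add_parser(parser)
        settings = get_project_settings()
        crawler = Crawler(settings)
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()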

@@ -5,9 +5,10 @@
 from scrapy.item import Item, Field
 
 
 class Result(Item):
     attribute = Field()
     value = Field()
     source = Field()
     reliability = Field()
     conditions = Field()

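Result is a plain Scrapy Item, so its fields are set dict-style. A short usage sketch; the values are illustrative only, not from this commit:

    result = Result()
    result['attribute'] = 'boiling point'  # illustrative values only
    result['value'] = '-161.5 degrees Celsius'
    result['source'] = 'SomeSource'
    result['reliability'] = 'unverified'
    result['conditions'] = '1 atm'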

@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    website = "http://localhost/*"
+
+    def parse(self, response):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass

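A site-specific parser would subclass this empty Parser, narrow website to its own URL pattern, and override parse. A minimal sketch, assuming the Result item above; the ExampleParser name, URL pattern, and the FourmiCrawler.items module path are assumptions:

    from FourmiCrawler.items import Result  # module path assumed


    class ExampleParser(Parser):
        website = "http://www.example.com/*"

        def parse(self, response):
            result = Result()
            result['source'] = 'Example'
            # ... fill attribute/value/reliability/conditions from response ...
            return [result]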

@@ -8,11 +8,13 @@
 BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['FourmiCrawler.spiders']
-NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
 
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
 
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'

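The value 100 is the pipeline's order within Scrapy's 0-1000 range; lower numbers run earlier. FourmiPipeline itself is not shown in this diff, so here is only a minimal sketch of the interface such a pipeline must satisfy, assuming it simply passes items through:

    class FourmiPipeline(object):
        def process_item(self, item, spider):
            # A pipeline must return the item (or raise DropItem)
            # so that any later pipelines can process it.
            return item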
FourmiCrawler/spider.py (new file)

@@ -0,0 +1,17 @@
+from scrapy.spider import Spider
+
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+        self.parsers = set()  # must exist before add_parser() is called
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other
+        # parsers.
+        pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)

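The [TODO] in parse points at delegation to the registered parsers. One hedged way to write that method, matching each parser's website wildcard against the responding URL; the matching strategy is an assumption, not part of this commit:

    import re

    # Sketch of FourmiSpider.parse: hand the response to the first parser
    # whose website pattern matches; wildcard-to-regex conversion is naive.
    def parse(self, response):
        for parser in self.parsers:
            pattern = re.escape(parser.website).replace(r'\*', '.*')
            if re.match(pattern, response.url):
                return parser.parse(response)
        return None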

@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give a logical start url.
-
-    def parse(self, response):
-        pass


@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give a logical start url.
-
-    def parse(self, response):
-        pass


@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.