Archived
1
0

New Structure, splitting on parsers instead of Spiders

This commit is contained in:
Jip J. Dekker 2014-03-27 13:08:46 +01:00
parent 306a37db1a
commit 8175e02f6c
4 changed files with 16 additions and 31 deletions

View File

@ -7,13 +7,11 @@ Fourmi - An internet webcrawler searching for information on chemical compounds.
from twisted.internet import reactor from twisted.internet import reactor
from scrapy.crawler import Crawler from scrapy.crawler import Crawler
from scrapy import log, signals from scrapy import log, signals
from FourmiCrawler.spiders.Chemspider import ChemspiderSpider # [review] - There should be an easy way to import all spiders! from FourmiCrawler.spiders.Fourmispider import FourmiSpider
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
defined_spiders = [ChemspiderSpider(compound = "Methane")] def setup_crawler(compound):
spider = FourmiSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
def setup_crawler(Spider, compound):
spider = FollowAllSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
settings = get_project_settings() settings = get_project_settings()
crawler = Crawler(settings) crawler = Crawler(settings)
crawler.configure() crawler.configure()
@ -21,8 +19,7 @@ def setup_crawler(Spider, compound):
crawler.start() crawler.start()
def start(): def start():
for spider in defined_spiders: setup_crawler(compound)
setup_crawler(spider, compound)
log.start() log.start()
reactor.run() reactor.run()

View File

@ -1,12 +0,0 @@
from scrapy.spider import Spider
class ChemspiderSpider(Spider):
    """Spider stub for chemspider.com, scoped to a single chemical compound."""

    name = "Chemspider"
    allowed_domains = ["chemspider.com"]

    def __init__(self, compound=None, *args, **kwargs):
        super(ChemspiderSpider, self).__init__(*args, **kwargs)
        # [TODO] - Give a logical start URL.
        start_url = "http://chemspiderapiurl/something/%s" % compound
        self.start_urls = [start_url]

    def parse(self, response):
        # Parsing not implemented yet.
        pass

View File

@ -0,0 +1,12 @@
from scrapy.spider import Spider
class FourmiSpider(Spider):
    """Single generic spider; page parsing is to be delegated to per-site parsers.

    Replaces the previous one-spider-per-site design (Chemspider, Wikipedia).
    """

    name = "FourmiSpider"

    def __init__(self, compound=None, *args, **kwargs):
        """Create a spider searching for *compound*.

        compound: the chemical compound to search for (may be None).
        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
        # Bug fix: the original accepted `compound` but silently discarded it;
        # keep it so the future parsers can build their queries from it.
        self.compound = compound
        # [TODO] - Initiate all parsers for the different websites and get allowed URLs.

    def parse(self, response):
        # Bug fix: parameter was misspelled "reponse"; Scrapy's callback
        # contract uses the conventional name `response`.
        # [TODO] - This function should delegate its functionality to other parsers.
        pass

View File

@ -1,12 +0,0 @@
from scrapy.spider import Spider
class WikipediaSpider(Spider):
    """Spider stub for wikipedia.org, scoped to a single chemical compound."""

    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]

    def __init__(self, compound=None, *args, **kwargs):
        super(WikipediaSpider, self).__init__(*args, **kwargs)
        # [TODO] - Give a logical start URL.
        start_url = "http://wikipediaurl/something/%s" % compound
        self.start_urls = [start_url]

    def parse(self, response):
        # Parsing not implemented yet.
        pass