New Structure, splitting on parsers instead of Spiders
This commit is contained in:
parent
306a37db1a
commit
8175e02f6c
11
Fourmi.py
11
Fourmi.py
@ -7,13 +7,11 @@ Fourmi - An internet webcrawler searching for information on chemical compounds.
|
||||
from twisted.internet import reactor
|
||||
from scrapy.crawler import Crawler
|
||||
from scrapy import log, signals
|
||||
from FourmiCrawler.spiders.Chemspider import ChemspiderSpider # [review] - There should be an easy way to import all spiders!
|
||||
from FourmiCrawler.spiders.Fourmispider import FourmiSpider
|
||||
from scrapy.utils.project import get_project_settings
|
||||
|
||||
defined_spiders = [ChemspiderSpider(compound = "Methane")]
|
||||
|
||||
def setup_crawler(Spider, compound):
|
||||
spider = FollowAllSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
|
||||
def setup_crawler(compound):
|
||||
spider = FourmiSpider(domain=domain) # [todo] - Do something smart to get the different spiders to work here.
|
||||
settings = get_project_settings()
|
||||
crawler = Crawler(settings)
|
||||
crawler.configure()
|
||||
@ -21,8 +19,7 @@ def setup_crawler(Spider, compound):
|
||||
crawler.start()
|
||||
|
||||
def start():
|
||||
for spider in defined_spiders:
|
||||
setup_crawler(spider, compound)
|
||||
setup_crawler(compound)
|
||||
log.start()
|
||||
reactor.run()
|
||||
|
||||
|
@ -1,12 +0,0 @@
|
||||
from scrapy.spider import Spider
|
||||
|
||||
class ChemspiderSpider(Spider):
    """Spider that queries chemspider.com for data on a single compound."""

    name = "Chemspider"
    allowed_domains = ["chemspider.com"]

    def __init__(self, compound=None, *args, **kwargs):
        """Build the spider and seed its start URL from *compound*."""
        super(ChemspiderSpider, self).__init__(*args, **kwargs)
        # [TODO] - Give a logical start url.
        url_template = "http://chemspiderapiurl/something/%s"
        self.start_urls = [url_template % compound]

    def parse(self, response):
        """Placeholder callback; response parsing is not implemented yet."""
        pass
|
12
FourmiCrawler/spiders/Fourmispider.py
Normal file
12
FourmiCrawler/spiders/Fourmispider.py
Normal file
@ -0,0 +1,12 @@
|
||||
from scrapy.spider import Spider
|
||||
|
||||
class FourmiSpider(Spider):
    """Single spider that will delegate scraping to per-site parsers.

    Replaces the per-website spiders: one crawl is configured per compound
    and the site-specific logic is meant to live in pluggable parsers.
    """

    name = "FourmiSpider"

    def __init__(self, compound=None, *args, **kwargs):
        """Create the spider for *compound*.

        :param compound: name of the chemical compound to search for.
        """
        super(FourmiSpider, self).__init__(*args, **kwargs)
        # Keep the requested compound around for the future parsers;
        # previously the argument was accepted but silently discarded.
        self.compound = compound
        # [TODO] - Initiate all parsers for the different websites and get allowed URLs.

    def parse(self, response):
        # Fixed parameter name (was misspelled "reponse"); Scrapy's contract
        # for callbacks is parse(self, response).
        # [TODO] - This function should delegate its functionality to other parsers.
        pass
|
@ -1,12 +0,0 @@
|
||||
from scrapy.spider import Spider
|
||||
|
||||
class WikipediaSpider(Spider):
    """Spider targeting wikipedia.org pages about one compound."""

    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]

    def __init__(self, compound=None, *args, **kwargs):
        """Initialise the spider and derive its start URL from *compound*."""
        super(WikipediaSpider, self).__init__(*args, **kwargs)
        # [TODO] - Give a logical start url.
        template = "http://wikipediaurl/something/%s"
        self.start_urls = [template % compound]

    def parse(self, response):
        """No-op parse callback; implementation pending."""
        pass
|
Reference in New Issue
Block a user