Merge branch 'feature/basic-structure' into develop

commit 6bbee865c4

Fourmi.py | 35 lines changed
@@ -1,22 +1,31 @@
 #!/usr/bin/env python
 """
-Fourmi - An internet webcrawler searching for information on chemical compounds.
-[todo] - Add some more useful text here.
+Fourmi - An internet webcrawler searching for information on chemical
+compounds. [todo] - Add some more useful text here.
 """
 
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
-from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
+from FourmiCrawler.spiders.Fourmispider import FourmiSpider
 from scrapy.utils.project import get_project_settings
 
-# [todo] - Add something to add all spiders, with the right references
-spider = ChemspiderSpider(compound = "Aspirin")
-settings = get_project_settings()
-crawler = Crawler(settings)
-crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-crawler.configure()
-crawler.crawl(spider)
-crawler.start()
-log.start()
-reactor.run()
+
+def setup_crawler(searchable):
+    # [TODO] - Initiate all parsers for the different websites and get
+    # allowed URLs.
+    spider = FourmiSpider(compound=searchable)
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def start():
+    setup_crawler("Methane")
+    log.start()
+    reactor.run()
+
+start()
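Note: the new entry point wires reactor.stop to the spider_closed signal, so the process exits as soon as the single crawl finishes. Also note an inconsistency in this merge: Fourmi.py imports FourmiSpider from FourmiCrawler.spiders.Fourmispider, but the spiders package is deleted below and the class actually lands in FourmiCrawler/spider.py. A minimal sketch of how the same pre-1.0 Scrapy Crawler API could search several compounds in one reactor run, stopping only after the last spider closes; the import path follows the new spider.py location, and the compound list and counter are illustrative, not part of this commit:

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings

from FourmiCrawler.spider import FourmiSpider

running = [0]  # mutable counter shared by the callbacks below

def crawler_stopped():
    running[0] -= 1
    if running[0] == 0:
        reactor.stop()  # stop only after the last spider has closed

def setup_crawler(searchable):
    crawler = Crawler(get_project_settings())
    crawler.signals.connect(crawler_stopped, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(FourmiSpider(compound=searchable))
    crawler.start()
    running[0] += 1

for compound in ["Methane", "Ethane"]:
    setup_crawler(compound)
log.start()
reactor.run()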
FourmiCrawler/items.py
@@ -5,9 +5,10 @@
 from scrapy.item import Item, Field
 
 
 class Result(Item):
     attribute = Field()
     value = Field()
     source = Field()
     reliability = Field()
     conditions = Field()
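Result is a plain Scrapy Item: each Field() declares a key, and instances behave like dictionaries. A hypothetical sketch of how a parser might populate one scraped fact (the values are illustrative, not from this commit):

from FourmiCrawler.items import Result

result = Result()
result['attribute'] = 'boiling point'
result['value'] = '-161.5 degrees Celsius'  # illustrative value for methane
result['source'] = 'ChemSpider'
result['reliability'] = 'high'              # free-form; no scale is defined in this commit
result['conditions'] = '1 atm'

print(result['value'])  # Items support dict-style access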
FourmiCrawler/parsers/__init__.py | 0 lines (new file)
FourmiCrawler/parsers/parser.py | 9 lines (new file)
@@ -0,0 +1,9 @@
+from scrapy import log
+
+
+class Parser:
+    website = "http://localhost/*"
+
+    def parse(self, response):
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
+        pass
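Parser is an empty base class: website holds a URL pattern and parse() only logs a warning. A hypothetical subclass, sketched to show the intended extension point; no concrete parser exists in this commit, and the class name, URL pattern, and DEBUG logging are assumptions:

from scrapy import log

from FourmiCrawler.items import Result
from FourmiCrawler.parsers.parser import Parser


class ChemSpiderParser(Parser):
    website = "http://www.chemspider.com/*"  # URLs this parser claims

    def parse(self, response):
        log.msg("Parsing a ChemSpider page: %s" % response.url, level=log.DEBUG)
        result = Result()
        # ... fill the Result fields from the response here ...
        return result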
FourmiCrawler/settings.py
@@ -8,11 +8,13 @@
 
 BOT_NAME = 'FourmiCrawler'
 
-SPIDER_MODULES = ['FourmiCrawler.spiders']
-NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
 
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
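Pointing SPIDER_MODULES and NEWSPIDER_MODULE at FourmiCrawler itself matches the rest of this merge: the FourmiCrawler.spiders package is deleted below and the spider now lives in FourmiCrawler/spider.py. A quick way to confirm the values resolve, assuming the standard scrapy.cfg project layout and running from the project root:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('BOT_NAME'))        # FourmiCrawler
print(settings.get('SPIDER_MODULES'))  # ['FourmiCrawler'] after this change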
FourmiCrawler/spider.py | 16 lines (new file)
@@ -0,0 +1,16 @@
+from scrapy.spider import Spider
+
+
+class FourmiSpider(Spider):
+    name = "FourmiSpider"
+
+    def __init__(self, compound=None, *args, **kwargs):
+        super(FourmiSpider, self).__init__(*args, **kwargs)
+
+    def parse(self, response):
+        # [TODO] - This function should delegate its functionality to other
+        # parsers.
+        pass
+
+    def add_parser(self, parser):
+        self.parsers.add(parser)
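Note that add_parser uses self.parsers, which __init__ never creates, so calling it on the committed code raises AttributeError; the compound argument is also discarded. A sketch of how the class could initialize its state and complete the delegation [TODO]; the parsers set, the stored compound, and the fnmatch matching against Parser.website are assumptions, not part of this commit:

import fnmatch

from scrapy.spider import Spider


class FourmiSpider(Spider):
    name = "FourmiSpider"

    def __init__(self, compound=None, *args, **kwargs):
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.compound = compound
        self.parsers = set()  # without this, add_parser raises AttributeError

    def parse(self, response):
        # Delegate to the first registered parser whose website pattern
        # (e.g. "http://localhost/*") matches the fetched URL.
        for parser in self.parsers:
            if fnmatch.fnmatch(response.url, parser.website):
                return parser.parse(response)

    def add_parser(self, parser):
        self.parsers.add(parser)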
FourmiCrawler/spiders/Chemspider.py | 12 lines (deleted)
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(ChemspiderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass
FourmiCrawler/spiders/Wikipedia.py | 12 lines (deleted)
@@ -1,12 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-
-    def __init__(self, compound=None, *args, **kwargs):
-        super(WikipediaSpider, self).__init__(*args, **kwargs)
-        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give an logical start url.
-
-    def parse(self, response):
-        pass
FourmiCrawler/spiders/__init__.py | 4 lines (deleted)
@@ -1,4 +0,0 @@
-# This package will contain the spiders of your Scrapy project
-#
-# Please refer to the documentation for information on how to create and manage
-# your spiders.