Merge branch 'feature/basic-structure' into develop
commit aa65bbd459
Fourmi.py (new executable file, 22 lines added)
@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""
Fourmi - A web crawler searching for information on chemical compounds.
[todo] - Add some more useful text here.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spiders.Chemspider import ChemspiderSpider  # [review] - There should be an easy way to import all spiders!
from scrapy.utils.project import get_project_settings

# [todo] - Add something to add all spiders, with the right references
spider = ChemspiderSpider(compound="Aspirin")
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
FourmiCrawler/pipelines.py (new file, 25 lines added)
@@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process incoming items so that exact duplicates are dropped.

        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: The item, if it is not yet known
        :raise DropItem: If the (attribute, value) pair was already seen
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
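A quick sketch of the deduplication behaviour with invented items (in a real crawl Scrapy calls process_item itself; the attribute/value data below is made up). Note that the key is only the (attribute, value) pair, so the same pair scraped by a different spider would also be dropped:

pipeline = FourmiPipeline()
item = {'attribute': 'boiling point', 'value': '100 degrees C'}  # invented example data
pipeline.process_item(item, spider=None)        # first occurrence: the item is returned
pipeline.process_item(dict(item), spider=None)  # second occurrence: raises DropItem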
FourmiCrawler/settings.py (modified)
@@ -6,10 +6,13 @@
 # http://doc.scrapy.org/en/latest/topics/settings.html
 #

-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'

-SPIDER_MODULES = ['Scrapy.spiders']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+SPIDER_MODULES = ['FourmiCrawler.spiders']
+NEWSPIDER_MODULE = 'FourmiCrawler.spiders'
+ITEM_PIPELINES = {
+    'FourmiCrawler.pipelines.FourmiPipeline': 100
+}

 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+#USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
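The value 100 is the pipeline's order key: Scrapy runs the entries in ITEM_PIPELINES in ascending order of these numbers (conventionally in the 0-1000 range). As a sketch, a hypothetical second pipeline (the ValidationPipeline name below is assumed, not in this commit) would run first by taking a lower number:

ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.ValidationPipeline': 50,  # hypothetical, runs first (lower number)
    'FourmiCrawler.pipelines.FourmiPipeline': 100,
}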
FourmiCrawler/spiders/Chemspider.py (new file, 12 lines added)
@@ -0,0 +1,12 @@
from scrapy.spider import Spider

class ChemspiderSpider(Spider):
    name = "Chemspider"
    allowed_domains = ["chemspider.com"]

    def __init__(self, compound=None, *args, **kwargs):
        super(ChemspiderSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://chemspiderapiurl/something/%s" % compound]  # [TODO] - Give a logical start URL.

    def parse(self, response):
        pass
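Because compound is accepted as a constructor keyword, the spider can also be launched from the command line with Scrapy's -a option, e.g. scrapy crawl Chemspider -a compound=Aspirin; Scrapy passes -a arguments to __init__ as keyword arguments. The start URL itself is still the placeholder flagged by the [TODO] above.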
FourmiCrawler/spiders/Wikipedia.py (new file, 12 lines added)
@@ -0,0 +1,12 @@
from scrapy.spider import Spider

class WikipediaSpider(Spider):
    name = "Wikipedia"
    allowed_domains = ["wikipedia.org"]

    def __init__(self, compound=None, *args, **kwargs):
        super(WikipediaSpider, self).__init__(*args, **kwargs)
        self.start_urls = ["http://wikipediaurl/something/%s" % compound]  # [TODO] - Give a logical start URL.

    def parse(self, response):
        pass
Scrapy/pipelines.py (deleted)
@@ -1,8 +0,0 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-
-class FourmiPipeline(object):
-    def process_item(self, item, spider):
-        return item
Scrapy/spiders/Chemspider.py (deleted)
@@ -1,11 +0,0 @@
-from scrapy.spider import Spider
-
-class ChemspiderSpider(Spider):
-    name = "Chemspider"
-    allowed_domains = ["chemspider.com"]
-    start_urls = (
-        'http://www.chemspider.com/',
-    )
-
-    def parse(self, response):
-        pass
Scrapy/spiders/Wikipedia.py (deleted)
@@ -1,11 +0,0 @@
-from scrapy.spider import Spider
-
-class WikipediaSpider(Spider):
-    name = "Wikipedia"
-    allowed_domains = ["wikipedia.org"]
-    start_urls = (
-        'http://www.wikipedia.org/',
-    )
-
-    def parse(self, response):
-        pass
scrapy.cfg (modified)
@@ -4,7 +4,7 @@
 # http://doc.scrapy.org/en/latest/topics/scrapyd.html

 [settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings

 [deploy]
 #url = http://localhost:6800/
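The [settings] default entry is how the scrapy command-line tool locates the project's settings module, so it has to track the Scrapy to FourmiCrawler module rename; without this change, scrapy crawl would still try to import the old Scrapy.settings.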