Merge branch 'release/basic-scraper-structure'
commit e0556bbf16
.gitignore (vendored) | 3
@@ -1,6 +1,9 @@
#EDITOR AND IDE SPECIFIC SETTING FILES
.idea

#Python Specific ignores
*.pyc

#THINGS WE WOULD NEVER EVER WANT!
#ignore thumbnails created by windows
Thumbs.db
Fourmi.py (new executable file) | 31
@@ -0,0 +1,31 @@
#!/usr/bin/env python
"""
Fourmi - An internet webcrawler searching for information on chemical
compounds. [todo] - Add some more useful text here.
"""

from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spider import FourmiSpider
from scrapy.utils.project import get_project_settings


def setup_crawler(searchable):
    # [TODO] - Initiate all parsers for the different websites and get
    # allowed URLs.
    spider = FourmiSpider(compound=searchable)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def start():
    setup_crawler("Methane")
    log.start()
    reactor.run()

start()
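A usage note, not part of the commit: start() above always searches for "Methane". A minimal sketch of how the entry point could take the search term from the command line instead, reusing setup_crawler(), log and reactor exactly as defined in Fourmi.py:

    import sys

    # Hypothetical variation for illustration only; Fourmi.py itself
    # hard-codes the compound.
    def start(compound="Methane"):
        setup_crawler(compound)  # setup_crawler() as defined above
        log.start()
        reactor.run()

    if __name__ == "__main__":
        # e.g. `python Fourmi.py Caffeine`
        start(sys.argv[1] if len(sys.argv) > 1 else "Methane")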
@@ -5,7 +5,10 @@

from scrapy.item import Item, Field

class FourmiItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass

class Result(Item):
    attribute = Field()
    value = Field()
    source = Field()
    reliability = Field()
    conditions = Field()
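For illustration, not part of the commit: Result is a plain Scrapy Item, so a parser fills it field by field. The module path and the concrete values below are assumptions; only the field names come from the diff.

    # Assumes the item definitions above live in FourmiCrawler/items.py.
    from FourmiCrawler.items import Result

    result = Result()
    result['attribute'] = 'boiling point'           # made-up example values
    result['value'] = '-161.5 degrees Celsius'
    result['source'] = 'http://example.com/methane'
    result['reliability'] = 'unverified'
    result['conditions'] = '1 atm'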
FourmiCrawler/parsers/__init__.py (new file) | 0
FourmiCrawler/parsers/parser.py (new file) | 9
@@ -0,0 +1,9 @@
from scrapy import log


class Parser:
    website = "http://localhost/*"

    def parse(self, response):
        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
        pass
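The Parser class above is only an empty base. A sketch of what a concrete, site-specific parser might look like under that design; ExampleSourceParser, its URL pattern and the returned values are invented for illustration, and the items module path is assumed.

    from scrapy import log
    from FourmiCrawler.parsers.parser import Parser
    from FourmiCrawler.items import Result  # assumed module path


    class ExampleSourceParser(Parser):
        # URL pattern of the site this parser knows how to read (hypothetical).
        website = "http://example.com/*"

        def parse(self, response):
            log.msg("ExampleSourceParser used on %s" % response.url, level=log.DEBUG)
            result = Result()
            result['attribute'] = 'example attribute'  # placeholder values
            result['value'] = 'example value'
            result['source'] = response.url
            return [result]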
FourmiCrawler/pipelines.py (new file) | 25
@@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Processing the items so exact doubles are dropped
        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: :raise DropItem: Returns the item if unique or drops it if it is already known
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
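Note that the duplicate check keys only on the (attribute, value) pair, so the same fact scraped from two different sources is also dropped. A minimal sketch of that behaviour with hypothetical items:

    # Illustration only: the second item differs in 'source' but shares the
    # (attribute, value) key, so process_item raises DropItem for it.
    pipeline = FourmiPipeline()
    first = Result(attribute='melting point', value='-182 degrees Celsius', source='site A')
    second = Result(attribute='melting point', value='-182 degrees Celsius', source='site B')

    pipeline.process_item(first, spider=None)   # accepted, key is now known
    pipeline.process_item(second, spider=None)  # raises DropItem("Duplicate item found: ...")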
@@ -6,10 +6,15 @@
# http://doc.scrapy.org/en/latest/topics/settings.html
#

-BOT_NAME = 'Fourmi'
+BOT_NAME = 'FourmiCrawler'

-SPIDER_MODULES = ['Scrapy.spiders']
-NEWSPIDER_MODULE = 'Scrapy.spiders'
+SPIDER_MODULES = ['FourmiCrawler']
+NEWSPIDER_MODULE = 'FourmiCrawler'
+ITEM_PIPELINES = {
+    'FourmiCrawler.pipelines.FourmiPipeline': 100
+}

-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
+# Crawl responsibly by identifying yourself (and your website) on the
+# user-agent
+
+# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
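For reference: the integer in ITEM_PIPELINES is the pipeline's order, and items flow through pipelines from lower to higher values, so a later post-processing step could be slotted behind the duplicate filter. SomeOtherPipeline below is hypothetical and not part of this commit.

    ITEM_PIPELINES = {
        'FourmiCrawler.pipelines.FourmiPipeline': 100,     # runs first
        'FourmiCrawler.pipelines.SomeOtherPipeline': 200,  # hypothetical, runs after
    }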
FourmiCrawler/spider.py (new file) | 16
@@ -0,0 +1,16 @@
from scrapy.spider import Spider


class FourmiSpider(Spider):
    name = "FourmiSpider"

    def __init__(self, compound=None, *args, **kwargs):
        super(FourmiSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        # [TODO] - This function should delegate its functionality to other
        # parsers.
        pass

    def add_parser(self, parser):
        self.parsers.add(parser)
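How the spider and the parsers connect is still open in this commit: parse() is a stub and self.parsers is never initialised, so add_parser() would currently raise AttributeError. One plausible reading of the intended delegation, sketched purely as an assumption rather than the project's actual design:

    import fnmatch

    from scrapy.spider import Spider


    class FourmiSpider(Spider):
        name = "FourmiSpider"

        def __init__(self, compound=None, *args, **kwargs):
            super(FourmiSpider, self).__init__(*args, **kwargs)
            self.compound = compound  # remember what we are searching for
            self.parsers = []         # parsers registered via add_parser()

        def parse(self, response):
            # Hand the response to the first parser whose website pattern
            # matches the URL; return None if nothing matches.
            for parser in self.parsers:
                if fnmatch.fnmatch(response.url, parser.website):
                    return parser.parse(response)
            return None

        def add_parser(self, parser):
            self.parsers.append(parser)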
Binary file not shown.
@@ -1,8 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

class FourmiPipeline(object):
    def process_item(self, item, spider):
        return item
Binary file not shown.
@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Binary file not shown.
@@ -4,7 +4,7 @@
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
-default = Scrapy.settings
+default = FourmiCrawler.settings

[deploy]
#url = http://localhost:6800/