
Merge branch 'release/basic-scraper-structure'

This commit is contained in:
Jip J. Dekker 2014-03-30 22:16:13 +02:00
commit e0556bbf16
15 changed files with 102 additions and 22 deletions

.gitignore vendored

@@ -1,6 +1,9 @@
# EDITOR AND IDE SPECIFIC SETTING FILES
.idea
#Python Specific ignores
*.pyc
#THINGS WE WOULD NEVER EVER WANT!
#ignore thumbnails created by windows
Thumbs.db

Fourmi.py Executable file

@@ -0,0 +1,31 @@
#!/usr/bin/env python
"""
Fourmi - An internet web crawler searching for information on chemical
compounds. [todo] - Add some more useful text here.
"""
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spider import FourmiSpider
from scrapy.utils.project import get_project_settings


def setup_crawler(searchable):
    # [TODO] - Initiate all parsers for the different websites and get
    # allowed URLs.
    spider = FourmiSpider(compound=searchable)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()


def start():
    setup_crawler("Methane")
    log.start()
    reactor.run()


if __name__ == '__main__':
    start()
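In this commit start() always crawls the hard-coded compound "Methane". A minimal sketch of how the compound could instead be taken from the command line; the sys.argv handling below is an assumption for illustration, not something this commit contains:

import sys

def start():
    # Hypothetical variant: read the compound from the first CLI argument,
    # falling back to the default used in this commit.
    compound = sys.argv[1] if len(sys.argv) > 1 else "Methane"
    setup_crawler(compound)
    log.start()
    reactor.run()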


@@ -5,7 +5,10 @@
from scrapy.item import Item, Field


class FourmiItem(Item):
    # define the fields for your item here like:
    # name = Field()
    pass


class Result(Item):
    attribute = Field()
    value = Field()
    source = Field()
    reliability = Field()
    conditions = Field()
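The Result item is the container the site parsers are meant to fill. A small sketch of how one could be populated; the field values and the FourmiCrawler.items import path are assumptions for illustration, not part of this diff:

from FourmiCrawler.items import Result  # assumed module path

# Hypothetical values for a single scraped property:
item = Result()
item['attribute'] = 'boiling point'
item['value'] = '-161.5 degrees Celsius'
item['source'] = 'http://example.com/methane'
item['reliability'] = 'unknown'
item['conditions'] = '1 atm'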

@@ -0,0 +1,9 @@
from scrapy import log


class Parser:
    website = "http://localhost/*"

    def parse(self, response):
        log.msg("The parse function of the empty parser was used.",
                level=log.WARNING)
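Site-specific scrapers would presumably subclass this empty Parser, overriding website and parse. A sketch of what such a subclass might look like; the source name, URL pattern, and the idea of returning Result items are assumptions, and the import of Parser itself depends on a module path that is not visible in this view:

from scrapy import log


class ExampleSourceParser(Parser):
    # Hypothetical data source; only the empty stub exists in this commit.
    website = "http://example-chemical-db.org/*"

    def parse(self, response):
        log.msg("Parsing %s" % response.url, level=log.DEBUG)
        # A real implementation would extract Result items from the response.
        return []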


@@ -0,0 +1,25 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class FourmiPipeline(object):

    def __init__(self):
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Process the items so that exact doubles are dropped.

        :param item: The incoming item
        :param spider: The spider which scraped the item
        :return: The item, if its attribute/value combination is not yet known
        :raise DropItem: When the item is an exact double of an earlier one
        """
        value = item['attribute'], item['value']
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.known_values.add(value)
            return item
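The deduplication key is the (attribute, value) tuple, so two items that differ only in source or conditions still count as doubles. A rough illustration outside of a real crawl; the Result import path and the example values are assumptions:

from FourmiCrawler.items import Result  # assumed module path

pipeline = FourmiPipeline()
first = Result(attribute='melting point', value='-182.5 degrees Celsius',
               source='A', reliability='', conditions='')
double = Result(attribute='melting point', value='-182.5 degrees Celsius',
                source='B', reliability='', conditions='')
pipeline.process_item(first, spider=None)   # kept, key is now known
pipeline.process_item(double, spider=None)  # raises DropItem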


@@ -6,10 +6,15 @@
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'Fourmi'
BOT_NAME = 'FourmiCrawler'
SPIDER_MODULES = ['Scrapy.spiders']
NEWSPIDER_MODULE = 'Scrapy.spiders'
SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler'
ITEM_PIPELINES = {
'FourmiCrawler.pipelines.FourmiPipeline': 100
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Fourmi (+http://www.yourdomain.com)'
# Crawl responsibly by identifying yourself (and your website) on the
# user-agent
# USER_AGENT = 'FourmiCrawler (+http://www.yourdomain.com)'
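The number behind the pipeline entry is its order: Scrapy runs pipelines with lower numbers first, and the documented convention is to keep the values in the 0-1000 range. A sketch of how a second, hypothetical stage would be chained behind the duplicate filter; the ExportPipeline name is an assumption, not part of this commit:

ITEM_PIPELINES = {
    'FourmiCrawler.pipelines.FourmiPipeline': 100,
    # Hypothetical later stage, not part of this commit:
    # 'FourmiCrawler.pipelines.ExportPipeline': 200,
}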

FourmiCrawler/spider.py Normal file

@@ -0,0 +1,16 @@
from scrapy.spider import Spider


class FourmiSpider(Spider):
    name = "FourmiSpider"

    def __init__(self, compound=None, *args, **kwargs):
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.parsers = set()

    def parse(self, response):
        # [TODO] - This function should delegate its functionality to other
        # parsers.
        pass

    def add_parser(self, parser):
        self.parsers.add(parser)
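The parse stub is intended to hand each response to the registered parsers. A possible shape for that delegation; the naive URL-prefix match against parser.website is an assumption, not something this commit implements:

    def parse(self, response):
        # Hypothetical delegation: find a parser whose website pattern
        # matches the response URL and let it handle the page.
        for parser in self.parsers:
            if response.url.startswith(parser.website.rstrip("*")):
                return parser.parse(response)
        return None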

Binary file not shown.


@@ -1,8 +0,0 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class FourmiPipeline(object):
    def process_item(self, item, spider):
        return item

Binary file not shown.


@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

Binary file not shown.


@@ -4,7 +4,7 @@
# http://doc.scrapy.org/en/latest/topics/scrapyd.html
[settings]
default = Scrapy.settings
default = FourmiCrawler.settings
[deploy]
#url = http://localhost:6800/