
Merge branch 'release/v0.0.1'

Jip J. Dekker 2014-04-01 21:44:08 +02:00
commit 254e8db3aa
3 changed files with 75 additions and 25 deletions

View File

@@ -2,6 +2,7 @@
 """
 Fourmi - An internet webcrawler searching for information on chemical
 compounds. [todo] - Add some more useful text here.
+Version: v0.0.1 - Empty Application that could do something but all logic of websites isn't there yet!
 """
 
 from twisted.internet import reactor
@@ -9,23 +10,36 @@ from scrapy.crawler import Crawler
 from scrapy import log, signals
 from FourmiCrawler.spider import FourmiSpider
 from scrapy.utils.project import get_project_settings
+import os, inspect, re
+
+
+def load_parsers(rel_dir="FourmiCrawler/parsers"):
+    path = os.path.dirname(os.path.abspath(__file__))
+    path += "/" + rel_dir
+    parsers = []
+
+    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
+        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
+        for cls in classes:
+            if re.match(path + "/*", inspect.getfile(cls)):
+                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
+    return parsers
 
 
-def setup_crawler(searchable):
-    # [TODO] - Initiate all parsers for the different websites and get
-    # allowed URLs.
-    spider = FourmiSpider(compound=searchable)
-    settings = get_project_settings()
-    crawler = Crawler(settings)
-    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-    crawler.configure()
-    crawler.crawl(spider)
-    crawler.start()
+def setup_crawler(searchables):
+    spider = FourmiSpider(compounds=searchables)
+    spider.add_parsers(load_parsers())
+    settings = get_project_settings()
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
 
 
 def start():
-    setup_crawler("Methane")
+    setup_crawler(["Methane"])
    log.start()
    reactor.run()
 
 start()
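The new load_parsers() turns the parsers package into a small plugin system: every .py file in FourmiCrawler/parsers is imported by its dotted module name, and any class defined in that directory is instantiated and handed to the spider. Below is a minimal, slightly simplified sketch of that discovery idiom; the module path FourmiCrawler.parsers.parser is an assumption about where the Parser base class from this commit lives, and the file-equality check stands in for the re.match() test used above.

import inspect

# __import__ with a non-empty fromlist returns the leaf module rather than the
# top-level package, which is what load_parsers() relies on.
mod = __import__("FourmiCrawler.parsers.parser", fromlist=["parser"])

# Keep only classes that are defined in this module's own file, so names the
# module merely imports (such as scrapy's log) are skipped.
own_classes = [obj for obj in vars(mod).values()
               if inspect.isclass(obj) and inspect.getfile(obj) == mod.__file__]

print(own_classes)  # e.g. a list containing the Parser class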

View File

@@ -1,9 +1,21 @@
 from scrapy import log
+# from scrapy.http import Request
 
 
 class Parser:
-    website = "http://localhost/*"
+    '''
+    website should be an regular expression of the urls of request the parser is able to parse.
+    '''
+    website = "http://something/*"
+    __spider = None
 
     def parse(self, reponse):
-        log.msg("The parse function of the empty parser was used.", level=log.Warning)
+        log.msg("The parse function of the empty parser was used.", level=log.WARNING)
         pass
+
+    def new_compound_request(self, compound):
+        # return Request(url=self.website[:-1] + compound, callback=self.parse)
+        pass
+
+    def set_spider(self, spider):
+        self.__spider = spider
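A concrete, site-specific parser would subclass this base class and be dropped into the same package so load_parsers() picks it up automatically. The class below is purely a hypothetical sketch, not part of this commit; the import path assumes the base class sits at FourmiCrawler/parsers/parser.py.

from scrapy import log

from FourmiCrawler.parsers.parser import Parser


class ExampleSourceParser(Parser):
    # Regular expression that FourmiSpider.parse() matches against response
    # URLs to decide which parser should handle a response.
    website = "http://source.example.com/*"

    def parse(self, response):
        # A real implementation would extract chemical properties here and
        # return Scrapy items; this stub only logs that it was reached.
        log.msg("ExampleSourceParser got: " + response.url, level=log.DEBUG)
        return None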

View File

@@ -1,19 +1,43 @@
 from scrapy.spider import Spider
+from scrapy import log
+import re
 
 
 class FourmiSpider(Spider):
     name = "FourmiSpider"
+    __parsers = []
+    synonyms = []
 
-    def __init__(self, compound=None, *args, **kwargs):
+    def __init__(self, compounds=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        self.synonyms = [compound]
+        if isinstance(compounds, list):
+            self.synonyms.extend(compounds)
+        else:
+            self.synonyms.append(compounds)
 
     def parse(self, reponse):
-        # [TODO] - This function should delegate it's functionality to other
-        # parsers.
-        pass
+        for parser in self.__parsers:
+            if re.match(parser.website, reponse.url):
+                log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
+                return parser.parse(reponse)
+        return None
+
+    def get_synonym_requests(self, compound):
+        requests = []
+        for parser in self.__parsers:
+            requests.append(parser.new_compound_request(compound))
+        return requests
+
+    def start_requests(self):
+        requests = []
+        for synonym in self.synonyms:
+            requests.extend(self.get_synonym_requests(synonym))
+        return requests
 
-    def add_parser(self, parser):
-        self.parsers.add(parser)
+    def add_parsers(self, parsers):
+        for parser in parsers:
+            self.add_parser(parser)
+
+    def add_parser(self, parser):
+        self.__parsers.append(parser)
+        parser.set_spider(self)
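Taken together, the three files wire up like this: setup_crawler() builds one FourmiSpider for a list of compounds and registers every discovered parser (each parser gets a back-reference via set_spider()), start_requests() asks each parser for one request per compound, and parse() routes each response to the parser whose website pattern matches the response URL. The short sketch below is not in the commit itself and only illustrates that flow; module paths are assumed from the imports above, and since new_compound_request() is still a stub in v0.0.1 the generated "requests" are None placeholders.

from FourmiCrawler.spider import FourmiSpider
from FourmiCrawler.parsers.parser import Parser

spider = FourmiSpider(compounds=["Methane", "Ethane"])
spider.add_parsers([Parser()])    # also calls Parser.set_spider(spider)

# One entry per (compound, parser) pair; with the stub parser these are None.
requests = spider.start_requests()
print(len(requests))              # 2

One design detail worth noting: __parsers and synonyms are class-level lists, so in Python they are shared by every FourmiSpider instance; a second spider created in the same process would inherit the first spider's parsers and synonyms.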