From 4d9e5307bf0c00f1db07511affd1a7c389efe812 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Mon, 31 Mar 2014 00:48:45 +0200 Subject: [PATCH] Written a loader for all parsers in the parser directory. --- Fourmi.py | 16 +++++++++++++--- FourmiCrawler/spider.py | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Fourmi.py b/Fourmi.py index 094a5d7..c411b4a 100755 --- a/Fourmi.py +++ b/Fourmi.py @@ -9,13 +9,23 @@ from scrapy.crawler import Crawler from scrapy import log, signals from FourmiCrawler.spider import FourmiSpider from scrapy.utils.project import get_project_settings -from FourmiCrawler.parsers.parser import Parser +import os, inspect +def load_parsers(rel_dir="FourmiCrawler/parsers"): + path = os.path.dirname(os.path.abspath(__file__)) + path += "/" + rel_dir + parsers = [] + + for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: + mod = __import__('.'.join(["FourmiCrawler.parsers", py]), fromlist=[py]) # [todo] - This module name should be derived from the rel_dir variable + classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] + for cls in classes: + parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? + return parsers def setup_crawler(searchable): - # [TODO] - Initiate all parsers for the different websites and get allowed URLs. 
spider = FourmiSpider(compound=searchable) - spider.add_parser(Parser()) + spider.add_parsers(load_parsers()) settings = get_project_settings() crawler = Crawler(settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 40d6dfc..d2711c4 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -5,7 +5,6 @@ import re class FourmiSpider(Spider): name = "FourmiSpider" - start_urls = ["http://localhost/"] parsers = [] def __init__(self, compound=None, *args, **kwargs): @@ -22,3 +21,6 @@ class FourmiSpider(Spider): def add_parser(self, parser): self.parsers.append(parser) + + def add_parsers(self, parsers): + self.parsers.extend(parsers)