Archived
1
0

Wrote a loader for all parsers in the parser directory.

This commit is contained in:
Jip J. Dekker 2014-03-31 00:48:45 +02:00
parent 0cc1b23353
commit 4d9e5307bf
2 changed files with 16 additions and 4 deletions

View File

@ -9,13 +9,23 @@ from scrapy.crawler import Crawler
from scrapy import log, signals
from FourmiCrawler.spider import FourmiSpider
from scrapy.utils.project import get_project_settings
from FourmiCrawler.parsers.parser import Parser
import os, inspect
def load_parsers(rel_dir="FourmiCrawler/parsers"):
    """Discover and instantiate every parser class found under *rel_dir*.

    :param rel_dir: directory holding the parser modules, relative to this
        file; its path separators map directly onto the dotted package path
        used for importing (e.g. ``FourmiCrawler/parsers`` ->
        ``FourmiCrawler.parsers``).
    :return: list containing one instance of each class *defined* in those
        modules.
    """
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_dir)
    # Derive the package prefix from rel_dir instead of hard-coding it
    # (resolves the old [todo] about the module name).
    package = rel_dir.replace("\\", "/").replace("/", ".")
    parsers = []
    for py in [f[:-3] for f in os.listdir(path)
               if f.endswith('.py') and f != '__init__.py']:
        mod = __import__('.'.join([package, py]), fromlist=[py])
        for attr in dir(mod):
            cls = getattr(mod, attr)
            # Only instantiate classes defined in this very module; skip
            # names merely imported into it (e.g. a shared Parser base),
            # which would otherwise be instantiated once per module.
            if inspect.isclass(cls) and cls.__module__ == mod.__name__:
                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
    return parsers
def setup_crawler(searchable):
    """Configure a Scrapy crawler that searches for *searchable*.

    NOTE(review): this diff hunk appears truncated — the crawler is
    presumably crawled/started below this point; confirm against the
    full file before relying on this view.
    """
    # [TODO] - Initiate all parsers for the different websites and get allowed URLs.
    spider = FourmiSpider(compound=searchable)
    # Registers the base Parser explicitly...
    spider.add_parser(Parser())
    # ...and every parser discovered in the parsers directory.
    spider.add_parsers(load_parsers())
    settings = get_project_settings()
    crawler = Crawler(settings)
    # Stop the Twisted reactor once the spider finishes its crawl.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)

View File

@ -5,7 +5,6 @@ import re
class FourmiSpider(Spider):
name = "FourmiSpider"
start_urls = ["http://localhost/"]
parsers = []
def __init__(self, compound=None, *args, **kwargs):
@ -22,3 +21,6 @@ class FourmiSpider(Spider):
def add_parser(self, parser):
    """Register a single parser instance with this spider."""
    self.parsers += [parser]
def add_parsers(self, parsers):
    """Register each parser in *parsers* with this spider."""
    for new_parser in parsers:
        self.parsers.append(new_parser)