diff --git a/fourmi.py b/fourmi.py index f421ba9..232f1f5 100755 --- a/fourmi.py +++ b/fourmi.py @@ -18,39 +18,19 @@ Options: -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] """ -import os -import inspect - from twisted.internet import reactor from scrapy.crawler import Crawler from scrapy import log, signals from scrapy.utils.project import get_project_settings import docopt -from FourmiCrawler.parsers.parser import Parser from FourmiCrawler.spider import FourmiSpider from sourceloader import SourceLoader -def load_parsers(rel_dir="FourmiCrawler/parsers"): - path = os.path.dirname(os.path.abspath(__file__)) - path += "/" + rel_dir - parsers = [] - known_parser = set() - - for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: - mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) - classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] - for cls in classes: - if issubclass(cls, Parser) and cls not in known_parser: - parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? - known_parser.add(cls) - return parsers - - -def setup_crawler(searchable, settings): +def setup_crawler(searchable, settings, loader): spider = FourmiSpider(compound=searchable) - spider.add_parsers(load_parsers()) + spider.add_parsers(loader.sources) crawler = Crawler(settings) crawler.signals.connect(reactor.stop, signal=signals.spider_closed) crawler.configure() @@ -86,10 +66,10 @@ def start_log(arguments): else: log.start(logstdout=True, loglevel=log.WARNING) -def search(arguments): +def search(arguments, loader): start_log(arguments) settings = scrapy_settings_manipulation(arguments) - setup_crawler([arguments[""]], settings) + setup_crawler([arguments[""]], settings, loader) reactor.run() @@ -98,7 +78,7 @@ if __name__ == '__main__': loader = SourceLoader() if arguments["search"]: - search(arguments) + search(arguments, loader) elif arguments["list"]: print "-== Available Sources ==-" print str(loader) \ No newline at end of file