Archived
1
0

Removed redundant source loader

This commit is contained in:
Jip J. Dekker 2014-04-16 10:36:46 +02:00
parent a06bf643f1
commit 7b57d86178

View File

@@ -18,39 +18,19 @@ Options:
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines] -f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
""" """
import os
import inspect
from twisted.internet import reactor from twisted.internet import reactor
from scrapy.crawler import Crawler from scrapy.crawler import Crawler
from scrapy import log, signals from scrapy import log, signals
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
import docopt import docopt
from FourmiCrawler.parsers.parser import Parser
from FourmiCrawler.spider import FourmiSpider from FourmiCrawler.spider import FourmiSpider
from sourceloader import SourceLoader from sourceloader import SourceLoader
def load_parsers(rel_dir="FourmiCrawler/parsers"): def setup_crawler(searchable, settings, loader):
path = os.path.dirname(os.path.abspath(__file__))
path += "/" + rel_dir
parsers = []
known_parser = set()
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
for cls in classes:
if issubclass(cls, Parser) and cls not in known_parser:
parsers.append(cls()) # [review] - Would we ever need arguments for the parsers?
known_parser.add(cls)
return parsers
def setup_crawler(searchable, settings):
spider = FourmiSpider(compound=searchable) spider = FourmiSpider(compound=searchable)
spider.add_parsers(load_parsers()) spider.add_parsers(loader.sources)
crawler = Crawler(settings) crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed) crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure() crawler.configure()
@@ -86,10 +66,10 @@ def start_log(arguments):
else: else:
log.start(logstdout=True, loglevel=log.WARNING) log.start(logstdout=True, loglevel=log.WARNING)
def search(arguments): def search(arguments, loader):
start_log(arguments) start_log(arguments)
settings = scrapy_settings_manipulation(arguments) settings = scrapy_settings_manipulation(arguments)
setup_crawler([arguments["<compound>"]], settings) setup_crawler([arguments["<compound>"]], settings, loader)
reactor.run() reactor.run()
@@ -98,7 +78,7 @@ if __name__ == '__main__':
loader = SourceLoader() loader = SourceLoader()
if arguments["search"]: if arguments["search"]:
search(arguments) search(arguments, loader)
elif arguments["list"]: elif arguments["list"]:
print "-== Available Sources ==-" print "-== Available Sources ==-"
print str(loader) print str(loader)