Archived
1
0

Removed redundant source loader

This commit is contained in:
Jip J. Dekker 2014-04-16 10:36:46 +02:00
parent a06bf643f1
commit 7b57d86178

View File

@ -18,39 +18,19 @@ Options:
-f <format> --format=<format> Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
"""
import os
import inspect
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
import docopt
from FourmiCrawler.parsers.parser import Parser
from FourmiCrawler.spider import FourmiSpider
from sourceloader import SourceLoader
def load_parsers(rel_dir="FourmiCrawler/parsers"):
    """Instantiate every Parser class found in the modules of a package directory.

    Every ``*.py`` file (except ``__init__.py``) directly inside ``rel_dir`` is
    imported as a submodule of the matching dotted package. Each class visible
    in such a module that passes ``issubclass(cls, Parser)`` is instantiated
    exactly once, without arguments.

    :param rel_dir: '/'-separated package directory, relative to the directory
        that contains this file. Empty path components (leading, trailing or
        doubled slashes) are ignored.
    :return: list of parser instances, ordered by module file name; within one
        module the order is that of ``dir()`` (alphabetical by attribute name).
    :raises OSError: if the directory cannot be listed.
    :raises ImportError: if one of the modules cannot be imported.
    """
    # Split once so the same components build both the filesystem path and the
    # dotted module name; dropping empty components keeps both well-formed.
    parts = [part for part in rel_dir.split("/") if part]
    directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), *parts)

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting parser order reproducible across filesystems.
    module_names = sorted(
        entry[:-3] for entry in os.listdir(directory)
        if entry.endswith('.py') and entry != '__init__.py'
    )

    parsers = []
    known_parser = set()
    for module_name in module_names:
        # A non-empty fromlist makes __import__ return the submodule itself
        # rather than the top-level package.
        mod = __import__('.'.join(parts + [module_name]), fromlist=[module_name])
        for attr_name in dir(mod):
            cls = getattr(mod, attr_name)
            if not inspect.isclass(cls):
                continue
            # NOTE(review): issubclass(Parser, Parser) is true, so the base class
            # is instantiated as well whenever a module imports it by name --
            # confirm that this is intended.
            if issubclass(cls, Parser) and cls not in known_parser:
                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
                known_parser.add(cls)
    return parsers
def setup_crawler(searchable, settings):
def setup_crawler(searchable, settings, loader):
spider = FourmiSpider(compound=searchable)
spider.add_parsers(load_parsers())
spider.add_parsers(loader.sources)
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
crawler.configure()
@ -86,10 +66,10 @@ def start_log(arguments):
else:
log.start(logstdout=True, loglevel=log.WARNING)
# search: handler for the "search" command. It starts logging via
# start_log(arguments), obtains the settings object from
# scrapy_settings_manipulation(arguments), sets up a crawler for the single
# compound given in arguments["<compound>"] and then runs the Twisted reactor.
# NOTE(review): this diff view has lost its +/- markers and indentation; each
# adjacent pair of near-identical lines below appears to be the pre-commit line
# followed by its replacement, which passes `loader` through to setup_crawler --
# confirm against the real file.
def search(arguments):
def search(arguments, loader):
start_log(arguments)
settings = scrapy_settings_manipulation(arguments)
setup_crawler([arguments["<compound>"]], settings)
setup_crawler([arguments["<compound>"]], settings, loader)
reactor.run()
@ -98,7 +78,7 @@ if __name__ == '__main__':
loader = SourceLoader()
if arguments["search"]:
search(arguments)
search(arguments, loader)
elif arguments["list"]:
print "-== Available Sources ==-"
print str(loader)