Archived
1
0

Merge branch 'develop' into feature/Wikipedia

This commit is contained in:
Jip J. Dekker 2014-04-08 11:45:23 +02:00
commit e10ac12d04
2 changed files with 7 additions and 2 deletions

View File

@@ -8,6 +8,7 @@ Version: v0.0.1 - Empty Application that could do something but all logic of web
from twisted.internet import reactor from twisted.internet import reactor
from scrapy.crawler import Crawler from scrapy.crawler import Crawler
from scrapy import log, signals from scrapy import log, signals
from FourmiCrawler.parsers.parser import Parser
from FourmiCrawler.spider import FourmiSpider from FourmiCrawler.spider import FourmiSpider
from scrapy.utils.project import get_project_settings from scrapy.utils.project import get_project_settings
import os, inspect, re import os, inspect, re
@@ -16,13 +17,15 @@ def load_parsers(rel_dir="FourmiCrawler/parsers"):
path = os.path.dirname(os.path.abspath(__file__)) path = os.path.dirname(os.path.abspath(__file__))
path += "/" + rel_dir path += "/" + rel_dir
parsers = [] parsers = []
known_parser = set()
for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']: for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py]) mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))] classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
for cls in classes: for cls in classes:
if re.match(path + "/*", inspect.getfile(cls)): if issubclass(cls, Parser) and cls not in known_parser:
parsers.append(cls()) # [review] - Would we ever need arguments for the parsers? parsers.append(cls()) # [review] - Would we ever need arguments for the parsers?
known_parser.add(cls)
return parsers return parsers
def setup_crawler(searchables): def setup_crawler(searchables):

View File

@@ -25,7 +25,9 @@ class FourmiSpider(Spider):
def get_synonym_requests(self, compound): def get_synonym_requests(self, compound):
requests = [] requests = []
for parser in self.__parsers: for parser in self.__parsers:
requests.append(parser.new_compound_request(compound)) parser_requests = parser.new_compound_request(compound)
if parser_requests is not None:
requests.append(parser_requests)
return requests return requests
def start_requests(self): def start_requests(self):