Merge branch 'develop' into feature/Wikipedia

Conflicts:
	Fourmi.py
Jip J. Dekker, 2014-04-16 16:49:03 +02:00
commit efacc08a3d
4 changed files with 99 additions and 53 deletions

Fourmi.py Deleted file

@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-"""
-Fourmi - An internet webcrawler searching for information on chemical
-compounds. [todo] - Add some more useful text here.
-Version: v0.0.1 - Empty Application that could do something but all logic of websites isn't there yet!
-"""
-
-from twisted.internet import reactor
-from scrapy.crawler import Crawler
-from scrapy import log, signals
-from FourmiCrawler.parsers.parser import Parser
-from FourmiCrawler.spider import FourmiSpider
-from scrapy.utils.project import get_project_settings
-import os, inspect, re
-
-
-def load_parsers(rel_dir="FourmiCrawler/parsers"):
-    path = os.path.dirname(os.path.abspath(__file__))
-    path += "/" + rel_dir
-    parsers = []
-    known_parser = set()
-
-    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
-        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
-        for cls in classes:
-            if issubclass(cls, Parser) and cls not in known_parser:
-                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
-                known_parser.add(cls)
-    return parsers
-
-
-def setup_crawler(searchables):
-    spider = FourmiSpider(compounds=searchables)
-    spider.add_parsers(load_parsers())
-    settings = get_project_settings()
-    crawler = Crawler(settings)
-    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
-    crawler.configure()
-    crawler.crawl(spider)
-    crawler.start()
-
-
-def start():
-    setup_crawler(["Methane"])
-    log.start(logstdout=False)
-    reactor.run()
-
-start()
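
Note: load_parsers() scans the FourmiCrawler/parsers package at runtime and instantiates every Parser subclass it finds; the same routine is carried over into the new fourmi.py below. A standalone sketch of this __import__-based discovery pattern follows; discover() and its parameter names are hypothetical, not part of this commit:

import inspect
import os


def discover(package_dir, package_name, base_cls):
    # Instantiate each subclass of base_cls defined in package_dir's modules.
    found, seen = [], set()
    for fname in os.listdir(package_dir):
        if not fname.endswith('.py') or fname == '__init__.py':
            continue
        modname = fname[:-3]
        # A non-empty fromlist makes __import__ return the submodule itself.
        mod = __import__(package_name + '.' + modname, fromlist=[modname])
        for attr in dir(mod):
            obj = getattr(mod, attr)
            if (inspect.isclass(obj) and issubclass(obj, base_cls)
                    and obj is not base_cls and obj not in seen):
                found.append(obj())
                seen.add(obj)
    return found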

FourmiCrawler/settings.py

@@ -13,6 +13,9 @@ NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
     'FourmiCrawler.pipelines.FourmiPipeline': 100
 }
+FEED_URI = 'results.json'
+FEED_FORMAT = 'jsonlines'
+
 # Crawl responsibly by identifying yourself (and your website) on the
 # user-agent
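
The two added settings enable Scrapy's feed exports: scraped items are serialized to results.json as JSON lines unless fourmi.py overrides both values from the command line. A quick sanity check, assuming it runs from the project root so the FourmiCrawler settings module is resolvable:

from scrapy.utils.project import get_project_settings

# Loads the settings module declared in scrapy.cfg (here FourmiCrawler.settings).
settings = get_project_settings()
assert settings.get("FEED_FORMAT") == "jsonlines"
assert settings.get("FEED_URI") == "results.json"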

FourmiCrawler/spider.py

@@ -8,12 +8,9 @@ class FourmiSpider(Spider):
     __parsers = []
     synonyms = []
 
-    def __init__(self, compounds=None, *args, **kwargs):
+    def __init__(self, compound=None, *args, **kwargs):
         super(FourmiSpider, self).__init__(*args, **kwargs)
-        if isinstance(compounds, list):
-            self.synonyms.extend(compounds)
-        else:
-            self.synonyms.append(compounds)
+        self.synonyms.append(compound)
 
     def parse(self, reponse):
         for parser in self.__parsers:
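
The spider now takes a single compound instead of a list, so handling multiple searches becomes the caller's job. A minimal sketch of the new constructor contract, assuming the FourmiCrawler package is importable:

from FourmiCrawler.spider import FourmiSpider

spider = FourmiSpider(compound="Methane")
# synonyms is a class attribute, so it is shared across spider instances.
assert "Methane" in spider.synonyms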

fourmi.py Executable file

@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+"""
+Fourmi, a web scraper built to search for specific information on a given compound (and its pseudonyms).
+
+Usage:
+    fourmi search <compound>
+    fourmi [options] search <compound>
+    fourmi -h | --help
+    fourmi --version
+
+Options:
+    -h --help                      Show this screen.
+    --version                      Show version.
+    --verbose                      Verbose logging output.
+    --log=<file>                   Save the log to a file.
+    -o <file> --output=<file>      Output file [default: result.*format*]
+    -f <format> --format=<format>  Output format (supported: csv, json, jsonlines, xml) [default: jsonlines]
+
+"""
+
+import os
+import inspect
+
+from twisted.internet import reactor
+from scrapy.crawler import Crawler
+from scrapy import log, signals
+from scrapy.utils.project import get_project_settings
+import docopt
+
+from FourmiCrawler.parsers.parser import Parser
+from FourmiCrawler.spider import FourmiSpider
+
+
+def load_parsers(rel_dir="FourmiCrawler/parsers"):
+    path = os.path.dirname(os.path.abspath(__file__))
+    path += "/" + rel_dir
+    parsers = []
+    known_parser = set()
+
+    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
+        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
+        for cls in classes:
+            if issubclass(cls, Parser) and cls not in known_parser:
+                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                known_parser.add(cls)
+    return parsers
+
+
+def setup_crawler(searchable, settings):
+    spider = FourmiSpider(compound=searchable)
+    spider.add_parsers(load_parsers())
+    crawler = Crawler(settings)
+    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
+    crawler.configure()
+    crawler.crawl(spider)
+    crawler.start()
+
+
+def scrapy_settings_manipulation(arguments):
+    settings = get_project_settings()
+
+    if arguments["--output"] != 'result.*format*':
+        settings.overrides["FEED_URI"] = arguments["--output"]
+    elif arguments["--format"] == "jsonlines":
+        settings.overrides["FEED_URI"] = "results.json"
+    elif arguments["--format"] is not None:
+        settings.overrides["FEED_URI"] = "results." + arguments["--format"]
+
+    if arguments["--format"] is not None:
+        settings.overrides["FEED_FORMAT"] = arguments["--format"]
+
+    return settings
+
+
+def start_log(arguments):
+    if arguments["--log"] is not None:
+        if arguments["--verbose"]:
+            log.start(logfile=arguments["--log"], logstdout=False, loglevel=log.DEBUG)
+        else:
+            log.start(logfile=arguments["--log"], logstdout=True, loglevel=log.WARNING)
+    else:
+        if arguments["--verbose"]:
+            log.start(logstdout=False, loglevel=log.DEBUG)
+        else:
+            log.start(logstdout=True, loglevel=log.WARNING)
+
+
+if __name__ == '__main__':
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.1.0')
+    start_log(arguments)
+    settings = scrapy_settings_manipulation(arguments)
+    setup_crawler(arguments["<compound>"], settings)
+    reactor.run()
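
For reference, a sketch of how docopt turns the usage string above into the arguments dict consumed by start_log() and scrapy_settings_manipulation(). The explicit argv is a hypothetical invocation; __doc__ is the module docstring when run inside fourmi.py:

import docopt

arguments = docopt.docopt(__doc__, argv=["--format=csv", "search", "Methane"],
                          version='Fourmi - V0.1.0')
assert arguments["<compound>"] == "Methane"
assert arguments["--format"] == "csv"
assert arguments["search"] is True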