
Merge branch 'release/v0.2.5'

commit 8747910416
Jip J. Dekker, 2014-04-23 16:09:35 +02:00
7 changed files with 109 additions and 47 deletions


@@ -1,4 +1,4 @@
-from parser import Parser
+from source import Source
 from scrapy import log
 from scrapy.http import Request
 from scrapy.selector import Selector
@@ -8,7 +8,7 @@ import re
 # [TODO] - Maybe clean up usage of '.extract()[0]', because of possible IndexError exception.
-class ChemSpider(Parser):
+class ChemSpider(Source):
     """ChemSpider scraper for synonyms and properties

     This parser will manage searching for chemicals through the
@@ -18,7 +18,7 @@ class ChemSpider(Parser):
     """

     def __init__(self):
-        pass
+        Source.__init__(self)

     website = 'http://www.chemspider.com/*'


@@ -2,7 +2,7 @@ from scrapy import log
 # from scrapy.http import Request

-class Parser:
+class Source:
     website = "http://something/*"  # Regex of URI's the source is able to parse
     _spider = None
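With the base class renamed from Parser to Source, new scrapers hook in by subclassing Source. Below is a minimal sketch of such a subclass, assuming only what the diffs above show (a `website` regex attribute and a `parse` hook called by the spider); the class name and URL pattern are made up for illustration:

    from source import Source


    class ExampleSource(Source):  # hypothetical example, not part of this commit
        # Regex of URI's this source can handle; the spider matches URLs against it.
        website = 'http://example.com/*'

        def __init__(self):
            Source.__init__(self)

        def parse(self, response):
            # A real source would extract synonyms/properties from the response here.
            return None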


@@ -15,7 +15,7 @@ class FourmiSpider(Spider):
     def parse(self, reponse):
         for parser in self.__parsers:
             if re.match(parser.website, reponse.url):
-                log.msg("Url: " + reponse.url + " -> Parser: " + parser.website, level=log.DEBUG)
+                log.msg("Url: " + reponse.url + " -> Source: " + parser.website, level=log.DEBUG)
                 return parser.parse(reponse)
         return None
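The dispatch above relies on re.match, which anchors at the start of the string, so each source's website attribute effectively acts as a URL prefix pattern. A self-contained sketch of the same check (the URLs are invented for illustration):

    import re

    website = 'http://www.chemspider.com/*'  # pattern from the ChemSpider source above

    for url in ['http://www.chemspider.com/Chemical-Structure.171.html',
                'http://example.org/unrelated']:
        print url, '->', bool(re.match(website, url))
    # Only the chemspider.com URL matches, so only that source's parse() is called.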

LICENSE Normal file (+21)

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Ivo B. Rietveld
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.


@@ -5,6 +5,9 @@ Fourmi, a web scraper built to search specific information for a given compound
 Usage:
     fourmi search <compound>
     fourmi [options] search <compound>
+    fourmi [options] [--include=<sourcename> | --exclude=<sourcename>] search <compound>
+    fourmi list
+    fourmi [--include=<sourcename> | --exclude=<sourcename>] list
     fourmi -h | --help
     fourmi --version
@@ -15,40 +18,23 @@ Options:
     --log=<file>  Save log to a file.
     -o <file> --output=<file>  Output file [default: result.*format*]
     -f <format> --format=<format>  Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
+    --include=<sourcenames>  Include only sources that match these regular expressions, split by a comma.
+    --exclude=<sourcenames>  Exclude sources that match these regular expressions, split by a comma.
 """
-import os
-import inspect
 from twisted.internet import reactor
 from scrapy.crawler import Crawler
 from scrapy import log, signals
 from scrapy.utils.project import get_project_settings
 import docopt
-from FourmiCrawler.parsers.parser import Parser
 from FourmiCrawler.spider import FourmiSpider
+from sourceloader import SourceLoader

-def load_parsers(rel_dir="FourmiCrawler/parsers"):
-    path = os.path.dirname(os.path.abspath(__file__))
-    path += "/" + rel_dir
-    parsers = []
-    known_parser = set()
-
-    for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
-        mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
-        classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
-        for cls in classes:
-            if issubclass(cls, Parser) and cls not in known_parser:
-                parsers.append(cls())  # [review] - Would we ever need arguments for the parsers?
-                known_parser.add(cls)
-    return parsers

-def setup_crawler(searchable, settings):
+def setup_crawler(searchable, settings, source_loader):
     spider = FourmiSpider(compound=searchable)
-    spider.add_parsers(load_parsers())
+    spider.add_parsers(source_loader.sources)
     crawler = Crawler(settings)
     crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
     crawler.configure()
@@ -56,39 +42,53 @@ def setup_crawler(searchable, settings):
     crawler.start()

-def scrapy_settings_manipulation(arguments):
+def scrapy_settings_manipulation(docopt_arguments):
     settings = get_project_settings()
-    if arguments["--output"] != 'result.*format*':
-        settings.overrides["FEED_URI"] = arguments["--output"]
-    elif arguments["--format"] == "jsonlines":
+    # [todo] - add at least a warning for files that already exist
+    if docopt_arguments["--output"] != 'result.*format*':
+        settings.overrides["FEED_URI"] = docopt_arguments["--output"]
+    elif docopt_arguments["--format"] == "jsonlines":
         settings.overrides["FEED_URI"] = "results.json"
-    elif arguments["--format"] is not None:
-        settings.overrides["FEED_URI"] = "results." + arguments["--format"]
+    elif docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_URI"] = "results." + docopt_arguments["--format"]

-    if arguments["--format"] is not None:
-        settings.overrides["FEED_FORMAT"] = arguments["--format"]
+    if docopt_arguments["--format"] is not None:
+        settings.overrides["FEED_FORMAT"] = docopt_arguments["--format"]
     return settings

-def start_log(arguments):
-    if arguments["--log"] is not None:
-        if arguments["--verbose"]:
-            log.start(logfile=arguments["--log"], logstdout=False, loglevel=log.DEBUG)
+def start_log(docopt_arguments):
+    if docopt_arguments["--log"] is not None:
+        if docopt_arguments["--verbose"]:
+            log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG)
         else:
-            log.start(logfile=arguments["--log"], logstdout=True, loglevel=log.WARNING)
+            log.start(logfile=docopt_arguments["--log"], logstdout=True, loglevel=log.WARNING)
     else:
-        if arguments["--verbose"]:
+        if docopt_arguments["--verbose"]:
             log.start(logstdout=False, loglevel=log.DEBUG)
         else:
             log.start(logstdout=True, loglevel=log.WARNING)

-if __name__ == '__main__':
-    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.0')
-    start_log(arguments)
-    settings = scrapy_settings_manipulation(arguments)
-    setup_crawler(arguments["<compound>"], settings)
+def search(docopt_arguments, source_loader):
+    start_log(docopt_arguments)
+    settings = scrapy_settings_manipulation(docopt_arguments)
+    setup_crawler(docopt_arguments["<compound>"], settings, source_loader)
     reactor.run()

+if __name__ == '__main__':
+    arguments = docopt.docopt(__doc__, version='Fourmi - V0.2.5')
+    loader = SourceLoader()
+
+    if arguments["--include"]:
+        loader.include(arguments["--include"].split(','))
+    elif arguments["--exclude"]:
+        loader.exclude(arguments["--exclude"].split(','))
+
+    if arguments["search"]:
+        search(arguments, loader)
+    elif arguments["list"]:
+        print "-== Available Sources ==-"
+        print str(loader)
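Taken together, the rewritten entry point gains a list command and per-crawl source filtering. For example (invocations sketched from the usage block above; the compound argument is whatever you want to search for):

    fourmi list
    fourmi --include=ChemSpider search <compound>

The first prints the available sources via SourceLoader's __str__; the second narrows the loaded sources to those whose class name matches the regular expression ChemSpider before the crawl starts.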

sourceloader.py Normal file (+41)

@@ -0,0 +1,41 @@
+import inspect
+import os
+import re
+from FourmiCrawler.sources.source import Source
+
+
+class SourceLoader:
+    sources = []
+
+    def __init__(self, rel_dir="FourmiCrawler/sources"):
+        path = os.path.dirname(os.path.abspath(__file__))
+        path += "/" + rel_dir
+        known_parser = set()
+
+        for py in [f[:-3] for f in os.listdir(path) if f.endswith('.py') and f != '__init__.py']:
+            mod = __import__('.'.join([rel_dir.replace("/", "."), py]), fromlist=[py])
+            classes = [getattr(mod, x) for x in dir(mod) if inspect.isclass(getattr(mod, x))]
+            for cls in classes:
+                if issubclass(cls, Source) and cls not in known_parser:
+                    self.sources.append(cls())  # [review] - Would we ever need arguments for the parsers?
+                    known_parser.add(cls)
+
+    def include(self, source_names):
+        new = set()
+        for name in source_names:
+            new.update([src for src in self.sources if re.match(name, src.__class__.__name__)])
+        self.sources = list(new)
+
+    def exclude(self, source_names):
+        exclude = []
+        for name in source_names:
+            exclude.extend([src for src in self.sources if re.match(name, src.__class__.__name__)])
+        self.sources = [src for src in self.sources if src not in exclude]
+
+    def __str__(self):
+        string = ""
+        for src in self.sources:
+            string += "Source: " + src.__class__.__name__
+            string += " - "
+            string += "URI: " + src.website + "\n"
+        return string
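SourceLoader discovers every Source subclass under FourmiCrawler/sources at construction time, so include and exclude filter live instances by class name. A rough usage sketch, run from the repository root and assuming the ChemSpider source shipped above is present:

    from sourceloader import SourceLoader

    loader = SourceLoader()         # imports every module under FourmiCrawler/sources
    print str(loader)               # one "Source: ... - URI: ..." line per source

    loader.include(['ChemSpider'])  # keep only sources whose class name matches the regex
    print str(loader)               # now lists just ChemSpider

Worth noting: sources is a class attribute rather than an instance attribute, so every SourceLoader constructed in the same process appends to one shared list, and a second SourceLoader() would duplicate the entries.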