diff --git a/fourmi.py b/fourmi.py index efa4e54..c09087d 100755 --- a/fourmi.py +++ b/fourmi.py @@ -34,6 +34,13 @@ from sourceloader import SourceLoader def setup_crawler(searchable, settings, source_loader, attributes): + """ + This function prepares and starts the crawler, which starts the actual search on the internet + :param searchable: The compound which should be searched + :param settings: A Scrapy settings object + :param source_loader: A fully functional SourceLoader object which contains only the sources that should be used. + :param attributes: A list of regular expressions which the attribute names should match. + """ spider = FourmiSpider(compound=searchable, selected_attributes=attributes) spider.add_parsers(source_loader.sources) crawler = Crawler(settings) @@ -44,8 +51,13 @@ def setup_crawler(searchable, settings, source_loader, attributes): def scrapy_settings_manipulation(docopt_arguments): + """ + This function manipulates the Scrapy settings that normally would be set in the settings file. In the Fourmi + project these are command line arguments. + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + """ settings = get_project_settings() - # [todo] - add at least a warning for files that already exist + if docopt_arguments["--output"] != 'result.*format*': settings.overrides["FEED_URI"] = docopt_arguments["--output"] elif docopt_arguments["--format"] == "jsonlines": @@ -60,6 +72,10 @@ def scrapy_settings_manipulation(docopt_arguments): def start_log(docopt_arguments): + """ + This function starts the logging functionality of Scrapy using the settings given by the CLI. + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. 
+ """ if docopt_arguments["--log"] is not None: if docopt_arguments["--verbose"]: log.start(logfile=docopt_arguments["--log"], logstdout=False, loglevel=log.DEBUG) @@ -73,12 +89,18 @@ def start_log(docopt_arguments): def search(docopt_arguments, source_loader): + """ + The function that facilitates the search for a specific compound. + :param docopt_arguments: A dictionary generated by docopt containing all CLI arguments. + :param source_loader: An initialized SourceLoader object pointed at the directory with the sources. + """ start_log(docopt_arguments) settings = scrapy_settings_manipulation(docopt_arguments) setup_crawler(docopt_arguments[""], settings, source_loader, docopt_arguments["--attributes"].split(',')) reactor.run() +# Entry point for the Fourmi command line interface. if __name__ == '__main__': arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0') loader = SourceLoader()