From bfa78f4697bb4b49f8855d0ccefef2121abe64f2 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Tue, 10 Jun 2014 22:30:59 +0200
Subject: [PATCH 01/21] Clean up documentation in Wikipedia parsers

---
 FourmiCrawler/sources/WikipediaParser.py | 40 ++++++++++++++++++------
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 8722cef..344f836 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -1,11 +1,9 @@
-import re
-
 from scrapy.http import Request
 from scrapy import log
-from scrapy.selector import Selector
-
 from source import Source
+from scrapy.selector import Selector
 from FourmiCrawler.items import Result
+import re
 
 
 class WikipediaParser(Source):
@@ -26,7 +24,11 @@ class WikipediaParser(Source):
         self.cfg = config
 
     def parse(self, response):
-        """ Distributes the above described behaviour """
+        """
+        Distributes the above described behaviour
+        :param response: The incoming search request
+        :return: the found properties if the response is new, or None if the page is already known
+        """
         log.msg('A response from %s just arrived!' % response.url, level=log.DEBUG)
         sel = Selector(response)
         compound = sel.xpath('//h1[@id="firstHeading"]//span/text()').extract()[0]  # makes sure to use main page
@@ -38,7 +40,14 @@ class WikipediaParser(Source):
         return items
 
     def parse_infobox(self, sel):
-        """ scrape data from infobox on wikipedia. """
+        """
+        Scrape data from the infobox on Wikipedia.
+
+        Data from two types of infoboxes, class="infobox bordered" and class="infobox", is scraped.
+        :param sel: The selector with the html-information of the page to parse
+        :return: item_list: a list of properties with their values, source, etc.
+        """
+
         items = []
 
         # be sure to get chembox (wikipedia template)
@@ -54,7 +63,7 @@ class WikipediaParser(Source):
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
-        #scrape the drugbox (wikipedia template)
+        #scrape the drugbox (wikipedia template)
         tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
         log.msg('dit: %s' % tr_list2, level=log.DEBUG)
         for tablerow in tr_list2:
             log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
             if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
                     'normalize-space(string())'):
                 item = self.newresult(
                     attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                     value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                 )
                 items.append(item)
                 log.msg(
                     'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
                     level=log.DEBUG)
@@ -97,7 +106,15 @@ class WikipediaParser(Source):
 
     @staticmethod
     def clean_items(items):
-        """ clean up properties using regex, makes it possible to split the values from the units """
+
+        """
+        Clean up properties using regex; this makes it possible to split the values from the units.
+
+        Almost not in use; it only cleans J/K/mol values and boiling/melting points.
+
+        :param items: List of properties with their values, source, etc.
+        :return: items: List of the now cleaned-up items
+        """
         for item in items:
             value = item['value']
             m = re.search('F;\s(\d+[\.,]?\d*)', value)  # clean up numerical Kelvin value (after F)
@@ -110,7 +127,12 @@ class WikipediaParser(Source):
 
     @staticmethod
     def get_identifiers(sel):
-        """ find external links, named 'Identifiers' to different sources. """
+        """
+        Find external links, named 'Identifiers', to different sources.
+ + :param sel: The selector with the html-information of the page to parse + :return: links: New links which can be used to expand the crawlers search + """ links = sel.xpath('//span[contains(concat(" ",normalize-space(@class)," "),"reflink")]/a' '[contains(concat(" ",normalize-space(@class)," "),"external")]/@href').extract() return links From 6621b3028c161e2809058f5fdff755c13eddcc4b Mon Sep 17 00:00:00 2001 From: Bas Vb Date: Tue, 10 Jun 2014 22:32:03 +0200 Subject: [PATCH 02/21] small typography --- FourmiCrawler/sources/WikipediaParser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 344f836..94dc9d5 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -10,7 +10,7 @@ class WikipediaParser(Source): """ Wikipedia scraper for chemical properties This parser parses Wikipedia info boxes (also bordered) to obtain properties and their values. - It also returns requests with other external sources which contain information on parsed subject. + It also returns requests with other external sources which contain information on parsed subject. """ website = "http://en.wikipedia.org/wiki/*" From e9a5fc08e5944b630eaee091663439e29e430ea0 Mon Sep 17 00:00:00 2001 From: Bas Vb Date: Tue, 10 Jun 2014 22:41:32 +0200 Subject: [PATCH 03/21] Splitting up parse function --- FourmiCrawler/sources/WikipediaParser.py | 61 ++++++++++++++---------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 94dc9d5..6ea222d 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -50,34 +50,11 @@ class WikipediaParser(Source): items = [] - # be sure to get chembox (wikipedia template) - tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). 
\ - xpath('normalize-space(string())') - prop_names = tr_list[::2] - prop_values = tr_list[1::2] - for i, prop_name in enumerate(prop_names): - item = self.newresult( - attribute=prop_name.extract().encode('utf-8'), - value=prop_values[i].extract().encode('utf-8') - ) - items.append(item) - log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) + # scrape the chembox (wikipedia template) + parse_chembox(sel,items) #scrape the drugbox (wikipedia template) - tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') - log.msg('dit: %s' % tr_list2, level=log.DEBUG) - for tablerow in tr_list2: - log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) - if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( - 'normalize-space(string())'): - item = self.newresult( - attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), - value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), - ) - items.append(item) - log.msg( - 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), - level=log.DEBUG) + parse_drugbox(sel,items) items = filter(lambda a: a['value'] != '', items) # remove items with an empty value item_list = self.clean_items(items) @@ -101,6 +78,38 @@ class WikipediaParser(Source): return item_list + def parse_chembox(self, sel, items): + tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ + xpath('normalize-space(string())') + prop_names = tr_list[::2] + prop_values = tr_list[1::2] + for i, prop_name in enumerate(prop_names): + item = self.newresult( + attribute=prop_name.extract().encode('utf-8'), + value=prop_values[i].extract().encode('utf-8') + ) + items.append(item) + log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG) + return items + + def parse_drugbox(self, sel, items): + tr_list2 = sel.xpath('.//table[@class="infobox"]//tr') + log.msg('dit: %s' % tr_list2, level=log.DEBUG) + for tablerow in tr_list2: + log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG) + if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath( + 'normalize-space(string())'): + item = self.newresult( + attribute=tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'), + value=tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'), + ) + items.append(item) + log.msg( + 'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), + level=log.DEBUG) + return items + + def new_compound_request(self, compound): return Request(url=self.website[:-1] + compound, callback=self.parse) From de474fea31d75b0f77fd010c457635f06b034664 Mon Sep 17 00:00:00 2001 From: Bas Vb Date: Tue, 10 Jun 2014 22:42:45 +0200 Subject: [PATCH 04/21] small fixes --- FourmiCrawler/sources/WikipediaParser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 6ea222d..38ed836 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -51,10 +51,10 @@ class WikipediaParser(Source): items = [] # scrape the chembox (wikipedia template) - parse_chembox(sel,items) + items = 
self.parse_chembox(sel, items)
 
         #scrape the drugbox (wikipedia template)
-        parse_drugbox(sel,items)
+        items = self.parse_drugbox(sel, items)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)

From a1859f2ec2a2986b4dc94fccf044e62243193bd0 Mon Sep 17 00:00:00 2001
From: Bas Vb
Date: Tue, 10 Jun 2014 22:46:50 +0200
Subject: [PATCH 05/21] final documentation

---
 FourmiCrawler/sources/WikipediaParser.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 38ed836..4aa49b2 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -79,6 +79,13 @@ class WikipediaParser(Source):
         return item_list
 
     def parse_chembox(self, sel, items):
+        """
+        Scrape data from the chembox infobox on Wikipedia.
+
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list in which the results have to be stored
+        :return: items: the list of items, including the newly found ones
+        """
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
@@ -93,6 +100,13 @@ class WikipediaParser(Source):
         return items
 
     def parse_drugbox(self, sel, items):
+        """
+        Scrape data from the drugbox infobox on Wikipedia.
+
+        :param sel: The selector with the html-information of the page to parse
+        :param items: the list in which the results have to be stored
+        :return: items: the list of items, including the newly found ones
+        """
         tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
         log.msg('dit: %s' % tr_list2, level=log.DEBUG)
         for tablerow in tr_list2:

From ee7f1ab739a4b3004635914a02c14baa5b5510b5 Mon Sep 17 00:00:00 2001
From: "Jip J. Dekker"
Date: Sun, 15 Jun 2014 19:26:13 +0200
Subject: [PATCH 06/21] Updated the Objectives and linkage to the wiki

---
 README.md | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 48b0419..f09f77c 100644
--- a/README.md
+++ b/README.md
@@ -23,21 +23,21 @@ documentation](http://doc.scrapy.org/en/latest/index.html).
 
 ### Installing
 
-If you're installing Fourmi, please take a look at our [installation guide](...)
-on our wiki. When you've installed the application, make sure to check our
-[usage guide](...).
+If you're installing Fourmi, please take a look at our installation guides
+on our [wiki](https://github.com/jjdekker/Fourmi/wiki). When you've installed the application, make sure to check our
+usage guide on the [Command Line Interface](https://github.com/jjdekker/Fourmi/wiki/CLI) and on the [Graphical User Interface](https://github.com/jjdekker/Fourmi/wiki/GUI).
 
 ### Using the Source
 
 To use the Fourmi source code multiple dependencies are required. Take a look at
-the [wiki page](...) on using the application source code for a step by step
+our [wiki pages](https://github.com/jjdekker/Fourmi/wiki) on using the application source code in our step-by-step
 installation guide. When developing for the Fourmi project keep in mind that
 code readability is a must. To maintain the readability, code should conform
 to the [PEP-8](http://legacy.python.org/dev/peps/pep-0008/) style guide for
 Python code. More information about the different structures and principles of the
-Fourmi application can be found on our [wiki](...).
+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki).
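
For readers working from source, the helpers touched by this series can also be driven directly from Python. A minimal sketch (illustrative only; `Configurator.set_logging` only exists after patches 12 and 13, and the call signatures are taken from the diffs in this series, not from a supported API):

```python
# Sketch: a programmatic equivalent of `fourmi search`, pieced together
# from the fourmi.py shown in this series. Treat as illustration.
from utils.configurator import Configurator
from utils.sourceloader import SourceLoader

conf = Configurator()
conf.set_output("results.csv", "csv")      # FEED_URI / FEED_FORMAT overrides
conf.set_logging(logfile=None, verbose=1)  # equivalent of a single -v flag
loader = SourceLoader()
loader.include(["Wikipedia"])              # restrict scraping to one source
```
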
+Fourmi application can be found on our [wiki](https://github.com/jjdekker/Fourmi/wiki). ### To Do @@ -45,13 +45,9 @@ The Fourmi project has the following goals for the nearby future: __Main goals:__ -- Improve our documentation and guides. (Assignee: Dekker) - Build an graphical user interface(GUI) as alternative for the command line interface(CLI). (Assignee: Harmen) - Compiling the source into an windows executable. (Assignee: Bas) -- Create an configuration file to hold logins and API keys. -- Determine reliability of our data point. -- Create an module to gather data from NIST. (Assignee: Rob) - Create an module to gather data from PubChem. (Assignee: Nout) __Side goals:__ From 2eb8f3e0af18cad1adafeb8d6e2783b483539c35 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 19:38:52 +0200 Subject: [PATCH 07/21] Changed logging CL option --- fourmi.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fourmi.py b/fourmi.py index e6d7e9a..ab4baef 100755 --- a/fourmi.py +++ b/fourmi.py @@ -5,6 +5,7 @@ Fourmi, a web scraper build to search specific information for a given compound Usage: fourmi search fourmi [options] search + fourmi [-v | -vv | -vvv] [options] search fourmi [options] [--include= | --exclude=] search fourmi list fourmi [--include= | --exclude=] list @@ -15,7 +16,7 @@ Options: --attributes= Include only that match these regular expressions split by a comma. [default: .*] -h --help Show this screen. --version Show version. - --verbose Verbose logging output. + -v Verbose logging output. (Multiple occurrences increase logging level) --log= Save log to an file. -o --output= Output file [default: results.*format*] -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: csv] @@ -25,8 +26,7 @@ Options: from twisted.internet import reactor from scrapy.crawler import Crawler -from scrapy import log, signals -from scrapy.utils.project import get_project_settings +from scrapy import signals import docopt from FourmiCrawler.spider import FourmiSpider @@ -69,6 +69,8 @@ if __name__ == '__main__': arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') loader = SourceLoader() + print arguments["-v"] + if arguments["--include"]: loader.include(arguments["--include"].split(',')) elif arguments["--exclude"]: From 4672903c9b9b39a3b64cb3f56e1c5530f89890ae Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 19:50:31 +0200 Subject: [PATCH 08/21] The logging now using the scrapy setting overrides --- fourmi.py | 4 +--- utils/configurator.py | 39 +++++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/fourmi.py b/fourmi.py index ab4baef..1fd54e7 100755 --- a/fourmi.py +++ b/fourmi.py @@ -58,7 +58,7 @@ def search(docopt_arguments, source_loader): :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. 
""" conf = Configurator() - conf.start_log(docopt_arguments["--log"], docopt_arguments["--verbose"]) + conf.start_log(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) reactor.run() @@ -69,8 +69,6 @@ if __name__ == '__main__': arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') loader = SourceLoader() - print arguments["-v"] - if arguments["--include"]: loader.include(arguments["--include"].split(',')) elif arguments["--exclude"]: diff --git a/utils/configurator.py b/utils/configurator.py index dfc6330..25a4883 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,6 +1,8 @@ +import ConfigParser + from scrapy import log from scrapy.utils.project import get_project_settings -import ConfigParser + class Configurator: """ @@ -33,20 +35,33 @@ class Configurator: def start_log(self, logfile, verbose): """ - This function starts the logging functionality of Scrapy using the settings given by the CLI. + This function changes the default settings of Scapy's logging functionality + using the settings given by the CLI. :param logfile: The location where the logfile will be saved. - :param verbose: A boolean value to switch between loglevels. + :param verbose: A integer value to switch between loglevels. """ - if logfile is not None: - if verbose: - log.start(logfile=logfile, logstdout=False, loglevel=log.DEBUG) - else: - log.start(logfile=logfile, logstdout=True, loglevel=log.WARNING) + if verbose != 0: + self.scrapy_settings.overrides["LOG_ENABLED"] = True else: - if verbose: - log.start(logstdout=False, loglevel=log.DEBUG) - else: - log.start(logstdout=True, loglevel=log.WARNING) + self.scrapy_settings.overrides["LOG_ENABLED"] = False + + if verbose == 1: + self.scrapy_settings.overrides["LOG_LEVEL"] = "WARNING" + elif verbose == 2: + self.scrapy_settings.overrides["LOG_LEVEL"] = "INFO" + else: + self.scrapy_settings.overrides["LOG_LEVEL"] = "DEBUG" + + if verbose > 1: + self.scrapy_settings.overrides["LOG_STDOUT"] = False + else: + self.scrapy_settings.overrides["LOG_STDOUT"] = True + + if logfile is not None: + self.scrapy_settings.overrides["LOG_FILE"] = logfile + else: + self.scrapy_settings.overrides["LOG_FILE"] = None + @staticmethod def read_sourceconfiguration(): From f604c3efcc62b39b139651b440ce46761204a0d9 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:07:11 +0200 Subject: [PATCH 09/21] Utils can't use the logging facilities as they aren't started yet --- utils/configurator.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/utils/configurator.py b/utils/configurator.py index 25a4883..7c1aaa8 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -1,6 +1,5 @@ import ConfigParser -from scrapy import log from scrapy.utils.project import get_project_settings @@ -90,7 +89,6 @@ class Configurator: elif config.defaults(): section = config.defaults() if 'reliability' not in section: - log.msg('Reliability not set for %s' % sourcename, - level=log.WARNING) + print 'Reliability not set for %s' % sourcename section['reliability'] = '' return section From 3ea950b93662d741f2b0d971ba43f3c9804c55eb Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:09:40 +0200 Subject: [PATCH 10/21] Logging facility is working again. 
--- fourmi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 1fd54e7..95fc53a 100755 --- a/fourmi.py +++ b/fourmi.py @@ -26,7 +26,7 @@ Options: from twisted.internet import reactor from scrapy.crawler import Crawler -from scrapy import signals +from scrapy import signals, log import docopt from FourmiCrawler.spider import FourmiSpider @@ -61,6 +61,7 @@ def search(docopt_arguments, source_loader): conf.start_log(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) + log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) reactor.run() From 3fe2cde892ba1889d7d845a71c2e41a8037781be Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:10:17 +0200 Subject: [PATCH 11/21] Error message clearly labeled as a warning --- utils/configurator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/configurator.py b/utils/configurator.py index 7c1aaa8..5cde4d5 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -89,6 +89,6 @@ class Configurator: elif config.defaults(): section = config.defaults() if 'reliability' not in section: - print 'Reliability not set for %s' % sourcename + print 'WARNING: Reliability not set for %s' % sourcename section['reliability'] = '' return section From e3d6087ed43e6c38ab6c156ea9926447e7867028 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:12:23 +0200 Subject: [PATCH 12/21] renamed logging function --- tests/test_configurator.py | 11 +++++------ utils/configurator.py | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index eb43cb7..cf54132 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -1,7 +1,8 @@ import unittest +import ConfigParser + from utils.configurator import Configurator -import ConfigParser class TestConfigurator(unittest.TestCase): @@ -21,11 +22,9 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv") self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") - # def test_start_log(self): - # self.conf.start_log("test.log", True) - # self.conf.start_log("test.log", False) - # self.conf.start_log(None, True) - # self.conf.start_log(None, False) + def test_start_log(self): + for i in range(0 ,3): + self.conf.set_logging() def test_read_sourceconfiguration(self): config = self.conf.read_sourceconfiguration() diff --git a/utils/configurator.py b/utils/configurator.py index 5cde4d5..03ef38f 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -32,7 +32,7 @@ class Configurator: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat - def start_log(self, logfile, verbose): + def set_logging(self, logfile, verbose): """ This function changes the default settings of Scapy's logging functionality using the settings given by the CLI. From 435356c3212e5f6656fd7f560217c398f2a26d16 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 20:32:24 +0200 Subject: [PATCH 13/21] Added default values to the logging function --- fourmi.py | 2 +- utils/configurator.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fourmi.py b/fourmi.py index 95fc53a..1b9237c 100755 --- a/fourmi.py +++ b/fourmi.py @@ -58,7 +58,7 @@ def search(docopt_arguments, source_loader): :param source_loader: An initiated SourceLoader object pointed at the directory with the sources. """ conf = Configurator() - conf.start_log(docopt_arguments["--log"], docopt_arguments["-v"]) + conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) diff --git a/utils/configurator.py b/utils/configurator.py index 03ef38f..7dc27c5 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -32,7 +32,7 @@ class Configurator: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat - def set_logging(self, logfile, verbose): + def set_logging(self, logfile=None, verbose=0): """ This function changes the default settings of Scapy's logging functionality using the settings given by the CLI. From fa42562b8e63bc049cac5a8769b02f7dd72a97c1 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:33:58 +0200 Subject: [PATCH 14/21] Tests for the Logging Functionality --- tests/test_configurator.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/test_configurator.py b/tests/test_configurator.py index cf54132..df29da9 100644 --- a/tests/test_configurator.py +++ b/tests/test_configurator.py @@ -23,8 +23,27 @@ class TestConfigurator(unittest.TestCase): self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv") def test_start_log(self): - for i in range(0 ,3): - self.conf.set_logging() + for i in range(0, 3): + self.conf.set_logging("TEST", i) + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), "TEST") + if i > 0: + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), True) + if i > 1: + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), False) + else: + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) + else: + self.assertEqual(self.conf.scrapy_settings.get("LOG_ENABLED"), False) + self.assertEqual(self.conf.scrapy_settings.get("LOG_STDOUT"), True) + if i == 1: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "WARNING") + elif i == 2: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "INFO") + elif i == 3: + self.assertEqual(self.conf.scrapy_settings.get("LOG_LEVEL"), "DEBUG") + + self.conf.set_logging(verbose=i) + self.assertEqual(self.conf.scrapy_settings.get("LOG_FILE"), None) def test_read_sourceconfiguration(self): config = self.conf.read_sourceconfiguration() From 66f2384747a5a86aba034729f532794e7c06e8fe Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 20:41:19 +0200 Subject: [PATCH 15/21] Default arguments can't be mutable --- FourmiCrawler/sources/ChemSpider.py | 7 +++++-- FourmiCrawler/sources/NIST.py | 9 +++++---- FourmiCrawler/sources/WikipediaParser.py | 14 +++++++++----- FourmiCrawler/sources/source.py | 2 +- FourmiCrawler/spider.py | 7 +++++-- 5 files changed, 25 insertions(+), 14 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 87a6ee7..fb51a4a 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -26,9 +26,12 @@ class ChemSpider(Source): structure = 'Chemical-Structure.%s.html' extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token=' - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) - self.cfg = config + if self.cfg is None: + self.cfg = {} + else: + self.cfg = config self.ignore_list = [] if 'token' not in self.cfg or self.cfg['token'] == '': log.msg('ChemSpider token not set or empty, search/MassSpec API ' diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index 3c323ef..d71d08f 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -22,12 +22,13 @@ class NIST(Source): search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' - cfg = {} - - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) self.ignore_list = set() - self.cfg = config + if config is None: + self.cfg = {} + else: + self.cfg = config def parse(self, response): sel = Selector(response) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index 4aa49b2..b995f30 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -1,9 +1,11 @@ +import re + from scrapy.http import Request from scrapy import log -from source import Source from scrapy.selector import Selector + +from source import Source from FourmiCrawler.items import Result -import re class WikipediaParser(Source): @@ -17,11 +19,13 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] - cfg = {} - def __init__(self, config={}): + def __init__(self, config=None): Source.__init__(self, config) - self.cfg = config + if config is None: + self.cfg = {} + else: + self.cfg = config def parse(self, response): """ diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index a609bb9..fe36784 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -6,7 +6,7 @@ class Source: website = "http://something/*" # Regex of URI's the source is able to parse _spider = None - def __init__(self, config={}): + def __init__(self, config=None): """ Initiation of a new Source """ diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 5c09f07..7552c7d 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -10,7 +10,7 @@ class FourmiSpider(Spider): """ name = "FourmiSpider" - def __init__(self, compound=None, selected_attributes=[".*"], *args, **kwargs): + def __init__(self, compound=None, selected_attributes=None, *args, **kwargs): """ Initiation of the Spider :param compound: compound that will be searched. 
@@ -20,7 +20,10 @@ class FourmiSpider(Spider): self.synonyms = set() super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) - self.selected_attributes = selected_attributes + if selected_attributes is None: + self.selected_attributes = [".*"] + else: + self.selected_attributes = selected_attributes def parse(self, response): """ From 74e7152d5fc2d35ad109f2660bb2385cdc04526d Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 20:45:35 +0200 Subject: [PATCH 16/21] A lot of PEP-8 fixes --- FourmiCrawler/sources/ChemSpider.py | 6 +++--- FourmiCrawler/sources/NIST.py | 14 +++++++------- FourmiCrawler/sources/WikipediaParser.py | 17 ++++++++--------- FourmiCrawler/spider.py | 2 +- fourmi.py | 6 ++++-- tests/test_spider.py | 1 - utils/configurator.py | 5 +---- utils/sourceloader.py | 1 + 8 files changed, 25 insertions(+), 27 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index fb51a4a..3f1538f 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -40,7 +40,6 @@ class ChemSpider(Source): self.search += self.cfg['token'] self.extendedinfo += self.cfg['token'] - def parse(self, response): sel = Selector(response) requests = [] @@ -202,13 +201,14 @@ class ChemSpider(Source): return properties def newresult(self, attribute, value, conditions='', source='ChemSpider'): - return Result({ + return Result( + { 'attribute': attribute, 'value': value, 'source': source, 'reliability': self.cfg['reliability'], 'conditions': conditions - }) + }) def parse_searchrequest(self, response): """Parse the initial response of the ChemSpider Search API """ diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index d71d08f..e81db5a 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -89,7 +89,6 @@ class NIST(Source): InChiKey, CAS number """ ul = sel.xpath('body/ul[li/strong="IUPAC Standard InChI:"]') - li = ul.xpath('li') raw_synonyms = ul.xpath('li[strong="Other names:"]/text()').extract() for synonym in raw_synonyms[0].strip().split(';\n'): @@ -256,12 +255,13 @@ class NIST(Source): return results def newresult(self, attribute, value, conditions=''): - return Result({ - 'attribute': attribute, - 'value': value, - 'source': 'NIST', - 'reliability': self.cfg['reliability'], - 'conditions': conditions + return Result( + { + 'attribute': attribute, + 'value': value, + 'source': 'NIST', + 'reliability': self.cfg['reliability'], + 'conditions': conditions }) def new_compound_request(self, compound): diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index b995f30..cfd2555 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -19,7 +19,6 @@ class WikipediaParser(Source): __spider = None searched_compounds = [] - def __init__(self, config=None): Source.__init__(self, config) if config is None: @@ -57,7 +56,7 @@ class WikipediaParser(Source): # scrape the chembox (wikipedia template) items = self.parse_chembox(sel, items) - #scrape the drugbox (wikipedia template) + # scrape the drugbox (wikipedia template) items = self.parse_drugbox(sel, items) items = filter(lambda a: a['value'] != '', items) # remove items with an empty value @@ -127,7 +126,6 @@ class WikipediaParser(Source): level=log.DEBUG) return items - def new_compound_request(self, compound): return Request(url=self.website[:-1] + compound, callback=self.parse) @@ -165,10 +163,11 @@ class 
WikipediaParser(Source): return links def newresult(self, attribute, value): - return Result({ - 'attribute': attribute, - 'value': value, - 'source': 'Wikipedia', - 'reliability': self.cfg['reliability'], - 'conditions': '' + return Result( + { + 'attribute': attribute, + 'value': value, + 'source': 'Wikipedia', + 'reliability': self.cfg['reliability'], + 'conditions': '' }) diff --git a/FourmiCrawler/spider.py b/FourmiCrawler/spider.py index 7552c7d..ebfd2cf 100644 --- a/FourmiCrawler/spider.py +++ b/FourmiCrawler/spider.py @@ -21,7 +21,7 @@ class FourmiSpider(Spider): super(FourmiSpider, self).__init__(*args, **kwargs) self.synonyms.add(compound) if selected_attributes is None: - self.selected_attributes = [".*"] + self.selected_attributes = [".*"] else: self.selected_attributes = selected_attributes diff --git a/fourmi.py b/fourmi.py index 1b9237c..2a422ef 100755 --- a/fourmi.py +++ b/fourmi.py @@ -60,8 +60,10 @@ def search(docopt_arguments, source_loader): conf = Configurator() conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"]) conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"]) - setup_crawler(docopt_arguments[""], conf.scrapy_settings, source_loader, docopt_arguments["--attributes"].split(',')) - log.start(conf.scrapy_settings.get("LOG_FILE"), conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) + setup_crawler(docopt_arguments[""], conf.scrapy_settings, + source_loader, docopt_arguments["--attributes"].split(',')) + log.start(conf.scrapy_settings.get("LOG_FILE"), + conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT")) reactor.run() diff --git a/tests/test_spider.py b/tests/test_spider.py index 589a571..1ee40b1 100644 --- a/tests/test_spider.py +++ b/tests/test_spider.py @@ -47,7 +47,6 @@ class TestFoumiSpider(unittest.TestCase): self.assertGreater(len(requests), 0) self.assertIsInstance(requests[0], Request) - def test_synonym_requests(self): # A test for the synonym request function self.spi._sources = [] diff --git a/utils/configurator.py b/utils/configurator.py index 7dc27c5..62987c6 100644 --- a/utils/configurator.py +++ b/utils/configurator.py @@ -12,7 +12,6 @@ class Configurator: def __init__(self): self.scrapy_settings = get_project_settings() - def set_output(self, filename, fileformat): """ This function manipulates the Scrapy output file settings that normally would be set in the settings file. @@ -31,7 +30,6 @@ class Configurator: if fileformat is not None: self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat - def set_logging(self, logfile=None, verbose=0): """ This function changes the default settings of Scapy's logging functionality @@ -61,7 +59,6 @@ class Configurator: else: self.scrapy_settings.overrides["LOG_FILE"] = None - @staticmethod def read_sourceconfiguration(): """ @@ -70,7 +67,7 @@ class Configurator: :return a ConfigParser object of sources.cfg """ config = ConfigParser.ConfigParser() - config.read('sources.cfg') # [TODO]: should be softcoded eventually + config.read('sources.cfg') # [TODO]: should be softcoded eventually return config @staticmethod diff --git a/utils/sourceloader.py b/utils/sourceloader.py index 9b33657..8c54464 100644 --- a/utils/sourceloader.py +++ b/utils/sourceloader.py @@ -5,6 +5,7 @@ import re from FourmiCrawler.sources.source import Source from utils.configurator import Configurator + class SourceLoader: sources = [] From 79cf15b95c30cd937a9394df92ed87a3a635c07e Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 20:50:00 +0200 Subject: [PATCH 17/21] Refractoring double code --- FourmiCrawler/sources/ChemSpider.py | 4 ---- FourmiCrawler/sources/NIST.py | 4 ---- FourmiCrawler/sources/WikipediaParser.py | 4 ---- FourmiCrawler/sources/source.py | 3 +++ 4 files changed, 3 insertions(+), 12 deletions(-) diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py index 3f1538f..0110e57 100644 --- a/FourmiCrawler/sources/ChemSpider.py +++ b/FourmiCrawler/sources/ChemSpider.py @@ -28,10 +28,6 @@ class ChemSpider(Source): def __init__(self, config=None): Source.__init__(self, config) - if self.cfg is None: - self.cfg = {} - else: - self.cfg = config self.ignore_list = [] if 'token' not in self.cfg or self.cfg['token'] == '': log.msg('ChemSpider token not set or empty, search/MassSpec API ' diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py index e81db5a..934b457 100644 --- a/FourmiCrawler/sources/NIST.py +++ b/FourmiCrawler/sources/NIST.py @@ -25,10 +25,6 @@ class NIST(Source): def __init__(self, config=None): Source.__init__(self, config) self.ignore_list = set() - if config is None: - self.cfg = {} - else: - self.cfg = config def parse(self, response): sel = Selector(response) diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index cfd2555..401698c 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -21,10 +21,6 @@ class WikipediaParser(Source): def __init__(self, config=None): Source.__init__(self, config) - if config is None: - self.cfg = {} - else: - self.cfg = config def parse(self, response): """ diff --git a/FourmiCrawler/sources/source.py b/FourmiCrawler/sources/source.py index fe36784..36218b0 100644 --- a/FourmiCrawler/sources/source.py +++ b/FourmiCrawler/sources/source.py @@ -10,6 +10,9 @@ class Source: """ Initiation of a new Source """ + self.cfg = {} + if config is not None: + self.cfg = config pass def parse(self, response): From 147b148dbdfa102de5b1b6d002480cb6acfca39d Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:00:36 +0200 Subject: [PATCH 18/21] Force a attribute of the test item to be None --- tests/test_pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index dfb8e83..eb2b070 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -13,6 +13,7 @@ class TestPipelines(unittest.TestCase): def test_none_pipeline(self): # Testing the pipeline that replaces the None values in items. self.testItem["value"] = "abc" + self.testItem["source"] = None pipe = pipelines.RemoveNonePipeline() processed = pipe.process_item(self.testItem, spider.FourmiSpider()) From a27e1e4bdd30b402a2c0ec99f8556c777bf57197 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:09:43 +0200 Subject: [PATCH 19/21] Bumped version number --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index 2a422ef..55a3c20 100755 --- a/fourmi.py +++ b/fourmi.py @@ -69,7 +69,7 @@ def search(docopt_arguments, source_loader): # The start for the Fourmi Command Line interface. if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.0') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.5.1') loader = SourceLoader() if arguments["--include"]: From 5f3ade8ff9f29d36e86a6e5d6b598cdc9870d60e Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Sun, 15 Jun 2014 21:11:30 +0200 Subject: [PATCH 20/21] Added a changelog --- Changelog.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 Changelog.md diff --git a/Changelog.md b/Changelog.md new file mode 100644 index 0000000..2a63786 --- /dev/null +++ b/Changelog.md @@ -0,0 +1,3 @@ +### v0.5.1 +- UPDATED: Logging functionality from command line +- DEV: Code cleanup and extra tests \ No newline at end of file From 9c9aba55d8a32ba716d83ad7aa1f5816db61fe63 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Sun, 15 Jun 2014 21:12:19 +0200 Subject: [PATCH 21/21] Added my signature, confirming validity of current files --- SIGNED.md | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 SIGNED.md diff --git a/SIGNED.md b/SIGNED.md new file mode 100644 index 0000000..79e66cb --- /dev/null +++ b/SIGNED.md @@ -0,0 +1,103 @@ +##### Signed by https://keybase.io/jdekker +``` +-----BEGIN PGP SIGNATURE----- +Version: GnuPG v1.4.11 (GNU/Linux) + +iQIcBAABAgAGBQJTnfAAAAoJEJrQ9RIUCT6/KZIQAME07yzAG5hnqsQof5ESoeQs +5wBxAhiBIX/0yn3qIT/eMh0ubCKUZsqJ3/PzUljeMJ6CGtwxFYfTWkgjYlOoAz9G +fS7CjPmRPyiu+MFo5he+oVRmLUMqfuLUrCyuIxJwMXq5YbQvzyqiffvxr8VRULtV +3c0drWfQMX1ZeAWSIYN0xuMndzvaqIAQU6o4tSQf/rUiKlM2NnTDNUHu2PY9FED/ +IJwM/IgAMAkJARyL7ltq6pHzORsu7sd2Nhv0esa0Gs2GSuRjKueeMZvJzpDAufy9 +bWn9EqKhVwPR6zWnXRmNj9Ymj1w167hIUYcBdFhC7kie5zv9+pDE6d/s7pw/Rejd +L0k8LKBGtJ8o7SKYR9kcNLDWXEnHjfCraD+14FMYqQPcz2ekoV6Exv/mP8qRPwUc +b+FtjJtW8fEiOMAyjMOvLTzYbCVwjdErAqgNdHeSByi1nxfrphjajRiNUt7fVimJ +++QZzKCj6xN2MuTJ41KbZ8teiUXwQB4OKKij0fgoy0RBwW0vqH6MF7cCKm1zT1Qa +9FGlBU2jSybQqUu4lJ/eUjO/3tQMhJErQJU/i+6lwi7OMnS9J/g17Heghp5Hxyhc +VWvhR56pbWLIL2XQqDGGEqPDIzXohHnbRJ1N71b06akIvIIrTqc6Glu4PJeUG/Pe +EF8/jBwydxbKUOyKRSQS +=xWbc +-----END PGP SIGNATURE----- + +``` + + + +### Begin signed statement + +#### Expect + +``` +size exec file contents + ./ +17591 .coverage 1dd1207846db74e407d3a4a1951b8e81934a4693385d39f6c337a224375bad39|1b7ead09cf213b5a9545557be982aaa30238b689bb54adf604f82b12ef521eb2 +375 .gitignore d2e475a6a4fa51422cac0a07495914e776858fb9ab9c8937a4d491a3e042d6b1 +464 .travis.yml 3063ba078607b8d16bd6467afc15fbbaa4b26c1e30be5ce7cef453cfccbaa95c +97 Changelog.md bcbce9a33bbbbcd18fd7788e6dc3a9c4b13dff7128ea99968994c1b290ddc931 + FourmiCrawler/ +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +304 items.py b00d49a3d53fa13306c7f8b023adb93ab88423c4fce46600689814f6b02bb806 +2178 pipelines.py f9b7b84938060751e15e45de5133dffe50c798bff2a20019206fe7c9d677ad49 +716 settings.py 37a8f63e123bccc77076d574617a522b30c1d7c5e893ec3d78cc40e1563dd8a6 + sources/ +9991 ChemSpider.py 847013e34c5c3683ec66a337837287512b4bab9fbea2ece12e4130ab0dbf264d +9898 NIST.py 97abc84fce85c47b789822715a1945ab84cc052a32340c861141c1af66bab644 +6907 WikipediaParser.py 5d6de911c773129a34b76c40a9b547aafc67644a15f39cd0be6afc7a16fb0f97 +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +1262 source.py 16c4cdfca849b7dc2bc89d7a6f7ad021f4aa1d04234394312f1d0edf0fd9c5a4 +3026 spider.py 1ffba2512988b7a6b535a4a31a4ef688ece4f8c595c3d50355c34ef46b23e44a +1081 LICENSE 36951e5f1910bad3e008ab7228f35ad8933192e52d3c3ae6a5e875765e27192c +3965 README.md d21236d6a175be28ef8e2fee8a256e95b6a513163e3f1071c26c62e9093db7f3 +3659 x fourmi.py 81781ed7299e447e6fc551fba69e62cd7a1d63f27dfa063927f4c5c10f5ac331 +200850 log.txt d76e741f9e7b67c2574e9cdbbe499ea4861f6e0bd11e5962fdaf9d8720effef8 +184692 results.csv 
31132f7f394babeb5dfd249aaa714756017b2e1b314b6715f57e6ad9524e5be8|d0bb724f6d714ec7a4a1ad2052f70dd4510b5ac08d616e24b5e9a903dedab586 +261 scrapy.cfg 624c068fd06303daa65b8e0d0d3ef88ac1f123be2694ef5b4f3f9a9dcd983f85 + tests/ +1 __init__.py 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b +2837 test_configurator.py 4a0eb6e7121eb09a63ab5cb797570d1a42080c5346c3b8b365da56eefa599e80 +1892 test_pipeline.py 387a336b0f36722a20e712aa033e5771c44f9e92561dd73acffd53d622c52031 +1260 test_sourceloader.py b108b4b80adcdb7401273a9823b1f1a19eb5178776186eb5a9976aed8b1ee869 +2113 test_spider.py 300f280377b522737be0d8e4a80031ab118a4011bdbb92131e9c400fcdab6299 + utils/ +0 __init__.py e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 +3552 configurator.py e2b7e0ee6c1fef4373785dfe5df8ec6950f31ce6a5d9632b69a66ea3d1eaf921 +2537 sourceloader.py f5a5ac2a6aba0658dbe11361f465caabcf3c06c5c8dc9a631874211cc19d2d37 +``` + +#### Ignore + +``` +/SIGNED.md +``` + +#### Presets + +``` +git # ignore .git and anything as described by .gitignore files +dropbox # ignore .dropbox-cache and other Dropbox-related files +kb # ignore anything as described by .kbignore files +``` + + + +### End signed statement + +
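
Each entry in the signed manifest above pairs a file's size and path with its SHA-256 digest (two digests separated by `|` where a file has more than one recorded form). To spot-check a single entry without keybase, one can recompute the digest locally; a minimal sketch in plain Python (not part of the signed tree):

```python
import hashlib

def sha256_of(path, chunk_size=8192):
    """Stream a file through SHA-256 and return its hex digest."""
    digest = hashlib.sha256()
    with open(path, 'rb') as handle:
        for block in iter(lambda: handle.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()

# Compare against the manifest entry for fourmi.py above.
print(sha256_of('fourmi.py'))
```
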

#### Notes

With keybase you can sign any directory's contents, whether it's a git repo,
source code distribution, or a personal documents folder. It aims to replace the drudgery of:

  1. comparing a zipped file to a detached statement
  2. downloading a public key
  3. confirming it is in fact the author's by reviewing public statements they've made, using it

All in one simple command:

```bash
keybase dir verify
```

There are lots of options, including assertions for automating your checks.

For more info, check out https://keybase.io/docs/command_line/code_signing
\ No newline at end of file
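
As a closing note on the series itself: the behavioural core of patches 07 through 14 is the mapping from the counted `-v` flag to Scrapy's log settings. A standalone sketch of that mapping, distilled from the `set_logging` diff in patch 08 (illustrative only; it returns plain data instead of touching `scrapy_settings.overrides`):

```python
def log_settings(verbose=0, logfile=None):
    """Mirror of Configurator.set_logging from patch 08, as pure data."""
    settings = {
        "LOG_ENABLED": verbose != 0,
        "LOG_STDOUT": verbose <= 1,  # -vv and up keep stdout clean
        "LOG_FILE": logfile,
    }
    if verbose == 1:
        settings["LOG_LEVEL"] = "WARNING"
    elif verbose == 2:
        settings["LOG_LEVEL"] = "INFO"
    else:
        settings["LOG_LEVEL"] = "DEBUG"
    return settings

assert log_settings(verbose=2)["LOG_LEVEL"] == "INFO"
```
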