diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index b4bf6f0..e95d067 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -89,7 +89,7 @@ class ChemSpider(Source):
 
             # Test for properties without values, with one hardcoded exception
             if (not re.match(r'^\d', prop_value) or
-                (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
+                    (prop_name == 'Polarizability' and prop_value == '10-24cm3')):
                 continue
 
             m = re.match(r'(.*) \((.*)\)', prop_name)
@@ -122,12 +122,12 @@ class ChemSpider(Source):
         properties = []
 
         scraped_list = sel.xpath('.//li[span="Experimental Physico-chemical '
-                'Properties"]//li/table/tr/td')
+                                 'Properties"]//li/table/tr/td')
         if not scraped_list:
             return properties
         # Format is: property name followed by a list of values
         property_name = scraped_list.pop(0).xpath(
-                'span/text()').extract()[0].rstrip()
+            'span/text()').extract()[0].rstrip()
         for line in scraped_list:
             if line.xpath('span/text()'):
                 property_name = line.xpath('span/text()').extract()[0].rstrip()
@@ -251,12 +251,12 @@ class ChemSpider(Source):
         :return: A Result item
         """
         return Result({
-                'attribute': attribute,
-                'value': value,
-                'source': source,
-                'reliability': self.cfg['reliability'],
-                'conditions': conditions
-            })
+            'attribute': attribute,
+            'value': value,
+            'source': source,
+            'reliability': self.cfg['reliability'],
+            'conditions': conditions
+        })
 
     def parse_searchrequest(self, response):
         """
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 691b062..52f1332 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -313,12 +313,13 @@ class NIST(Source):
         :param conditions: optional conditions regarding the value
         :return: A Result item
         """
-        return Result({
-            'attribute': attribute,
-            'value': value,
-            'source': 'NIST',
-            'reliability': self.cfg['reliability'],
-            'conditions': conditions
+        return Result(
+            {
+                'attribute': attribute,
+                'value': value,
+                'source': 'NIST',
+                'reliability': self.cfg['reliability'],
+                'conditions': conditions
             })
 
     def new_compound_request(self, compound):
diff --git a/FourmiCrawler/sources/PubChem.py b/FourmiCrawler/sources/PubChem.py
index 15fa3f9..4cd5304 100644
--- a/FourmiCrawler/sources/PubChem.py
+++ b/FourmiCrawler/sources/PubChem.py
@@ -15,7 +15,7 @@ class PubChem(Source):
     including sources of the values of properties.
""" - #PubChem has its data on compound name, properties and their values on different html pages, so different URLs used + # PubChem has its data on compound name, properties and their values on different html pages, so different URLs used website = 'http://.*\\.ncbi\\.nlm\\.nih\\.gov/.*' website_www = 'http://www.ncbi.nlm.nih.gov/*' website_pubchem = 'http://pubchem.ncbi.nlm.nih.gov/.*' @@ -54,14 +54,16 @@ class PubChem(Source): n = re.search(r'cid=(\d+)', response.url) if n: cid = n.group(1) - log.msg('cid: %s' % cid, level=log.DEBUG) #getting the right id of the compound with which it can reach - # the seperate html page which contains the properties and their values + log.msg('cid: %s' % cid, level=log.DEBUG) # getting the right id of the compound with which it can reach + # the seperate html page which contains the properties and their values - #using this cid to get the right url and scrape it - requests.append(Request(url=self.website_pubchem[:-2].replace("\\","") + self.data_url % cid, callback=self.parse_data)) + # using this cid to get the right url and scrape it + requests.append( + Request(url=self.website_pubchem[:-2].replace("\\", "") + self.data_url % cid, callback=self.parse_data)) return requests - def parse_data(self, response): + @staticmethod + def parse_data(response): """ Parse data found in 'Chemical and Physical properties' part of a substance page. :param response: The response with the page to parse @@ -74,8 +76,8 @@ class PubChem(Source): props = sel.xpath('//div') for prop in props: - prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing - if prop.xpath('a'): # parsing for single value in property + prop_name = ''.join(prop.xpath('b/text()').extract()) # name of property that it is parsing + if prop.xpath('a'): # parsing for single value in property prop_source = ''.join(prop.xpath('a/@title').extract()) prop_value = ''.join(prop.xpath('a/text()').extract()) new_prop = Result({ @@ -89,7 +91,7 @@ class PubChem(Source): (new_prop['attribute'], new_prop['value'], new_prop['source']), level=log.DEBUG) requests.append(new_prop) - elif prop.xpath('ul'): # parsing for multiple values (list) in property + elif prop.xpath('ul'): # parsing for multiple values (list) in property prop_values = prop.xpath('ul//li') for prop_li in prop_values: prop_value = ''.join(prop_li.xpath('a/text()').extract()) @@ -102,8 +104,8 @@ class PubChem(Source): 'conditions': '' }) log.msg('PubChem prop: |%s| |%s| |%s|' % - (new_prop['attribute'], new_prop['value'], - new_prop['source']), level=log.DEBUG) + (new_prop['attribute'], new_prop['value'], + new_prop['source']), level=log.DEBUG) requests.append(new_prop) return requests @@ -116,7 +118,7 @@ class PubChem(Source): case the search request forwarded to the compound page """ - #check if pubchem forwarded straight to compound page + # check if pubchem forwarded straight to compound page m = re.match(self.website_pubchem, response.url) if m: log.msg('PubChem search forwarded to compound page', diff --git a/fourmi.py b/fourmi.py index e45d605..f0caa05 100755 --- a/fourmi.py +++ b/fourmi.py @@ -17,7 +17,7 @@ Options: --version Show version. -v Verbose logging output. (Multiple occurrences increase logging level) --log= Save log to an file. - -o --output= Output file [default: results.*format*] + -o --output= Output file [default: .*format*] -f --format= Output formats (supported: csv, json, jsonlines, xml) [default: csv] --include= Include only sources that match these regular expressions split by a comma. 
     --exclude=<regex>           Exclude the sources that match these regular expressions split by a comma.
@@ -58,12 +58,12 @@ def search(docopt_arguments, source_loader):
     """
     conf = Configurator()
     conf.set_logging(docopt_arguments["--log"], docopt_arguments["-v"])
-    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"])
+    conf.set_output(docopt_arguments["--output"], docopt_arguments["--format"], docopt_arguments["<compound>"])
     setup_crawler(docopt_arguments["<compound>"], conf.scrapy_settings,
                   source_loader, docopt_arguments["--attributes"].split(','))
     if conf.scrapy_settings.getbool("LOG_ENABLED"):
         log.start(conf.scrapy_settings.get("LOG_FILE"),
-            conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
+                  conf.scrapy_settings.get("LOG_LEVEL"), conf.scrapy_settings.get("LOG_STDOUT"))
     reactor.run()
 
 
diff --git a/tests/test_configurator.py b/tests/test_configurator.py
index df29da9..0eb593d 100644
--- a/tests/test_configurator.py
+++ b/tests/test_configurator.py
@@ -10,16 +10,16 @@ class TestConfigurator(unittest.TestCase):
         self.conf = Configurator()
 
     def test_set_output(self):
-        self.conf.set_output(filename="test.txt", fileformat="csv")
+        self.conf.set_output(filename="test.txt", fileformat="csv", compound="test")
         self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.txt")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
-        self.conf.set_output("results.*format*", "jsonlines")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.json")
+        self.conf.set_output("<compound>.*format*", "jsonlines", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.json")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "jsonlines")
 
-        self.conf.set_output("results.*format*", "csv")
-        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "results.csv")
+        self.conf.set_output("<compound>.*format*", "csv", "test")
+        self.assertEqual(self.conf.scrapy_settings["FEED_URI"], "test.csv")
         self.assertEqual(self.conf.scrapy_settings["FEED_FORMAT"], "csv")
 
     def test_start_log(self):
diff --git a/utils/configurator.py b/utils/configurator.py
index 358adc7..2db7cdb 100644
--- a/utils/configurator.py
+++ b/utils/configurator.py
@@ -3,6 +3,7 @@ import os
 
 from scrapy.utils.project import get_project_settings
 
+
 class Configurator:
     """
     A helper class in the fourmi class. This class is used to process the settings as set
@@ -12,7 +13,7 @@ class Configurator:
     def __init__(self):
         self.scrapy_settings = get_project_settings()
 
-    def set_output(self, filename, fileformat):
+    def set_output(self, filename, fileformat, compound):
         """
         This function manipulates the Scrapy output file settings that normally would be set in the settings file.
         In the Fourmi project these are command line arguments.
@@ -20,12 +21,13 @@ class Configurator:
         :param fileformat: The format in which the output will be.
+        :param compound: The name of the compound, used to build the default output filename.
         """
 
-        if filename != 'results.*format*':
+        if filename != '<compound>.*format*':
             self.scrapy_settings.overrides["FEED_URI"] = filename
         elif fileformat == "jsonlines":
-            self.scrapy_settings.overrides["FEED_URI"] = "results.json"
+            self.scrapy_settings.overrides["FEED_URI"] = compound + ".json"
         elif fileformat is not None:
-            self.scrapy_settings.overrides["FEED_URI"] = "results." + fileformat
+            self.scrapy_settings.overrides["FEED_URI"] = compound + "." + fileformat
 
         if fileformat is not None:
             self.scrapy_settings.overrides["FEED_FORMAT"] = fileformat
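
As a sanity check on the new naming behavior, the branching in set_output can be exercised outside Scrapy. The sketch below is illustrative only: resolve_feed_uri is a hypothetical stand-in for Configurator.set_output, and the plain dict stands in for scrapy_settings.overrides; the branch logic mirrors the patch above.

# Minimal sketch of the default-filename logic introduced in set_output,
# isolated from Scrapy. resolve_feed_uri and the overrides dict are
# assumptions for illustration, not part of the patch.
def resolve_feed_uri(filename, fileformat, compound):
    overrides = {}
    if filename != '<compound>.*format*':
        # An explicit --output value wins over the default pattern.
        overrides["FEED_URI"] = filename
    elif fileformat == "jsonlines":
        # jsonlines is the one format whose extension (.json) differs from its name.
        overrides["FEED_URI"] = compound + ".json"
    elif fileformat is not None:
        overrides["FEED_URI"] = compound + "." + fileformat
    if fileformat is not None:
        overrides["FEED_FORMAT"] = fileformat
    return overrides

# The three cases covered by test_set_output:
assert resolve_feed_uri("test.txt", "csv", "test")["FEED_URI"] == "test.txt"
assert resolve_feed_uri("<compound>.*format*", "jsonlines", "test")["FEED_URI"] == "test.json"
assert resolve_feed_uri("<compound>.*format*", "csv", "test")["FEED_URI"] == "test.csv"

The literal comparison against '<compound>.*format*' works because docopt passes the declared default through unchanged whenever the user supplies no -o option, so matching the placeholder string is enough to detect "no explicit output file".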