diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py
index 1bcba3a..dd4e11d 100644
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@@ -4,8 +4,8 @@ import re
 
 from scrapy.exceptions import DropItem
 
-class RemoveNonePipeline(object):
+class RemoveNonePipeline(object):
     def __init__(self):
         pass
@@ -21,8 +21,8 @@ class RemoveNonePipeline(object):
             item[key] = ""
         return item
 
-class DuplicatePipeline(object):
+class DuplicatePipeline(object):
     def __init__(self):
         self.known_values = set()
@@ -35,13 +35,13 @@ class DuplicatePipeline(object):
         """
         value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item) #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item)  # [todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item
 
-class AttributeSelectionPipeline(object):
+class AttributeSelectionPipeline(object):
     def __init__(self):
         pass;
diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py
index be7c451..8c1df07 100644
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@@ -3,7 +3,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
-#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://doc.scrapy.org/en/latest/topics/settings.html
 #
 
 BOT_NAME = 'FourmiCrawler'
diff --git a/FourmiCrawler/sources/ChemSpider.py b/FourmiCrawler/sources/ChemSpider.py
index 254c1a5..dfada5f 100644
--- a/FourmiCrawler/sources/ChemSpider.py
+++ b/FourmiCrawler/sources/ChemSpider.py
@@ -63,7 +63,7 @@ class ChemSpider(Source):
             # Test for properties without values, with one hardcoded exception
             if (not re.match(r'^\d', prop_value) or
                     (prop_name == 'Polarizability' and
-                     prop_value == '10-24cm3')):
+                     prop_value == '10-24cm3')):
                 continue
 
             # Match for condition in parentheses
diff --git a/FourmiCrawler/sources/NIST.py b/FourmiCrawler/sources/NIST.py
index 2fe5966..a5f784d 100644
--- a/FourmiCrawler/sources/NIST.py
+++ b/FourmiCrawler/sources/NIST.py
@@ -10,7 +10,7 @@ from FourmiCrawler.items import Result
 
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
 # [TODO]: properties have references and comments which do not exist in the
-# Result item, but should be included eventually.
+# Result item, but should be included eventually.
 
 class NIST(Source):
     """NIST Scraper plugin
@@ -18,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website and
     parsing the resulting page if the chemical exists on NIST.
""" - website = "http://webbook.nist.gov/*" + website = "http://webbook.nist.gov/*" search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on' @@ -78,7 +78,7 @@ class NIST(Source): requests.extend(self.parse_generic_data(table, summary)) else: log.msg('NIST table: NOT SUPPORTED', level=log.WARNING) - continue #Assume unsupported + continue #Assume unsupported return requests def parse_generic_info(self, sel): @@ -106,7 +106,7 @@ class NIST(Source): data['IUPAC Standard InChI'] = raw_inchi.extract()[0] raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]' - '/tt/text()') + '/tt/text()') data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0] raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()') @@ -132,10 +132,10 @@ class NIST(Source): results = [] for tr in table.xpath('tr[td]'): extra_data_url = tr.xpath('td[last()][a="Individual data points"]' - '/a/@href').extract() + '/a/@href').extract() if extra_data_url: request = Request(url=self.website[:-1] + extra_data_url[0], - callback=self.parse_individual_datapoints) + callback=self.parse_individual_datapoints) results.append(request) continue data = [] @@ -183,7 +183,6 @@ class NIST(Source): }) results.append(result) - return results @staticmethod diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py index c4f7a0f..868b49f 100644 --- a/FourmiCrawler/sources/WikipediaParser.py +++ b/FourmiCrawler/sources/WikipediaParser.py @@ -38,7 +38,7 @@ class WikipediaParser(Source): """ scrape data from infobox on wikipedia. """ items = [] - #be sure to get chembox (wikipedia template) + # be sure to get chembox (wikipedia template) tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \ xpath('normalize-space(string())') prop_names = tr_list[::2] diff --git a/fourmi.py b/fourmi.py index b4c2b48..683e257 100755 --- a/fourmi.py +++ b/fourmi.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +# !/usr/bin/env python """ Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms). diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index f1fab36..ab97954 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -7,12 +7,11 @@ from FourmiCrawler import pipelines, spider, items class TestPipelines(unittest.TestCase): - def setUp(self): self.testItem = items.Result() def test_NonePipeline(self): - #Testing the pipeline that replaces the None values in items. + # Testing the pipeline that replaces the None values in items. self.testItem["value"] = "abc" pipe = pipelines.RemoveNonePipeline() processed = pipe.process_item(self.testItem, spider.FourmiSpider()) @@ -25,7 +24,7 @@ class TestPipelines(unittest.TestCase): self.assertIs(processed[key], "") def test_DuplicatePipeline(self): - #Testing the pipeline that removes duplicates. + # Testing the pipeline that removes duplicates. self.testItem["attribute"] = "test" self.testItem["value"] = "test" self.testItem["conditions"] = "test" @@ -39,7 +38,7 @@ class TestPipelines(unittest.TestCase): self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem) def test_AttributeSelection(self): - #Testing the pipeline that selects attributes. + # Testing the pipeline that selects attributes. 
         item1 = copy.deepcopy(self.testItem)
         item2 = copy.deepcopy(self.testItem)
diff --git a/tests/test_sourceloader.py b/tests/test_sourceloader.py
index b130e8d..1afca2d 100644
--- a/tests/test_sourceloader.py
+++ b/tests/test_sourceloader.py
@@ -4,7 +4,6 @@ from sourceloader import SourceLoader
 
 
 class TestSourceloader(unittest.TestCase):
-
     def setUp(self):
         self.loader = SourceLoader()
 
@@ -16,7 +15,7 @@ class TestSourceloader(unittest.TestCase):
         self.assertIn("Source: WikipediaParser", str(self.loader))
 
     def test_include(self):
-        #Tests for the include functionality.
+        # Tests for the include functionality.
         self.loader.include(["So.rc.*"])
 
         self.assertIn("Source: Source", str(self.loader))
@@ -25,7 +24,7 @@ class TestSourceloader(unittest.TestCase):
         self.assertNotIn("Source: WikipediaParser", str(self.loader))
 
     def test_exclude(self):
-        #Tests for the exclude functionality.
+        # Tests for the exclude functionality.
         self.loader.exclude(["So.rc.*"])
 
         self.assertNotIn("Source: Source", str(self.loader))
diff --git a/tests/test_spider.py b/tests/test_spider.py
index f5c8116..66878eb 100644
--- a/tests/test_spider.py
+++ b/tests/test_spider.py
@@ -8,7 +8,6 @@ from FourmiCrawler.sources.source import Source
 
 
 class TestFoumiSpider(unittest.TestCase):
-
     def setUp(self):
         self.compound = "test_compound"
         self.attributes = ["a.*", ".*a"]