
Code reformat

Jip J. Dekker 2014-06-04 19:34:23 +02:00
parent b3c230e835
commit 046fbed3cd
9 changed files with 19 additions and 23 deletions

View File

@@ -4,8 +4,8 @@ import re
 from scrapy.exceptions import DropItem

-class RemoveNonePipeline(object):
+class RemoveNonePipeline(object):
     def __init__(self):
         pass
@@ -21,8 +21,8 @@ class RemoveNonePipeline(object):
                 item[key] = ""
         return item

-class DuplicatePipeline(object):
+class DuplicatePipeline(object):
     def __init__(self):
         self.known_values = set()
@@ -35,13 +35,13 @@ class DuplicatePipeline(object):
         """
         value = (item['attribute'], item['value'], item['conditions'])
         if value in self.known_values:
-            raise DropItem("Duplicate item found: %s" % item) #[todo] append sources of first item.
+            raise DropItem("Duplicate item found: %s" % item) # [todo] append sources of first item.
         else:
             self.known_values.add(value)
         return item

-class AttributeSelectionPipeline(object):
+class AttributeSelectionPipeline(object):
     def __init__(self):
         pass;

View File

@@ -3,7 +3,7 @@
 # For simplicity, this file contains only the most important settings by
 # default. All the other settings are documented here:
 #
 # http://doc.scrapy.org/en/latest/topics/settings.html
 #

 BOT_NAME = 'FourmiCrawler'

View File

@@ -63,7 +63,7 @@ class ChemSpider(Source):
             # Test for properties without values, with one hardcoded exception
             if (not re.match(r'^\d', prop_value) or
                     (prop_name == 'Polarizability' and
                      prop_value == '10-24cm3')):
                 continue
             # Match for condition in parentheses

View File

@@ -10,7 +10,7 @@ from FourmiCrawler.items import Result
 # [TODO]: values can be '128.', perhaps remove the dot in that case?
 # [TODO]: properties have references and comments which do not exist in the
 # Result item, but should be included eventually.

 class NIST(Source):
     """NIST Scraper plugin
@@ -18,7 +18,7 @@ class NIST(Source):
     This plugin manages searching for a chemical on the NIST website
     and parsing the resulting page if the chemical exists on NIST.
     """

     website = "http://webbook.nist.gov/*"
     search = 'cgi/cbook.cgi?Name=%s&Units=SI&cTP=on'
@@ -78,7 +78,7 @@ class NIST(Source):
                 requests.extend(self.parse_generic_data(table, summary))
             else:
                 log.msg('NIST table: NOT SUPPORTED', level=log.WARNING)
                 continue #Assume unsupported
         return requests

     def parse_generic_info(self, sel):
@@ -106,7 +106,7 @@ class NIST(Source):
         data['IUPAC Standard InChI'] = raw_inchi.extract()[0]
         raw_inchikey = ul.xpath('li[strong="IUPAC Standard InChIKey:"]'
                                 '/tt/text()')
         data['IUPAC Standard InChIKey'] = raw_inchikey.extract()[0]

         raw_cas_number = ul.xpath('li[strong="CAS Registry Number:"]/text()')
@@ -132,10 +132,10 @@ class NIST(Source):
         results = []
         for tr in table.xpath('tr[td]'):
             extra_data_url = tr.xpath('td[last()][a="Individual data points"]'
                                       '/a/@href').extract()
             if extra_data_url:
                 request = Request(url=self.website[:-1] + extra_data_url[0],
                                   callback=self.parse_individual_datapoints)
                 results.append(request)
                 continue
             data = []
@@ -183,7 +183,6 @@ class NIST(Source):
             })
             results.append(result)
         return results
-
     @staticmethod

View File

@@ -38,7 +38,7 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []

-        #be sure to get chembox (wikipedia template)
+        # be sure to get chembox (wikipedia template)
         tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]

View File

@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+# !/usr/bin/env python
 """
 Fourmi, a web scraper build to search specific information for a given compound (and it's pseudonyms).

View File

@@ -7,12 +7,11 @@ from FourmiCrawler import pipelines, spider, items
 class TestPipelines(unittest.TestCase):
     def setUp(self):
         self.testItem = items.Result()

-
     def test_NonePipeline(self):
-        #Testing the pipeline that replaces the None values in items.
+        # Testing the pipeline that replaces the None values in items.
         self.testItem["value"] = "abc"
         pipe = pipelines.RemoveNonePipeline()
         processed = pipe.process_item(self.testItem, spider.FourmiSpider())
@@ -25,7 +24,7 @@ class TestPipelines(unittest.TestCase):
             self.assertIs(processed[key], "")

     def test_DuplicatePipeline(self):
-        #Testing the pipeline that removes duplicates.
+        # Testing the pipeline that removes duplicates.
         self.testItem["attribute"] = "test"
         self.testItem["value"] = "test"
         self.testItem["conditions"] = "test"
@@ -39,7 +38,7 @@ class TestPipelines(unittest.TestCase):
         self.assertEqual(pipe.process_item(otherItem, spider.FourmiSpider()), otherItem)

     def test_AttributeSelection(self):
-        #Testing the pipeline that selects attributes.
+        # Testing the pipeline that selects attributes.
         item1 = copy.deepcopy(self.testItem)
         item2 = copy.deepcopy(self.testItem)

View File

@@ -4,7 +4,6 @@ from sourceloader import SourceLoader
 class TestSourceloader(unittest.TestCase):
     def setUp(self):
         self.loader = SourceLoader()
-
@@ -16,7 +15,7 @@ class TestSourceloader(unittest.TestCase):
         self.assertIn("Source: WikipediaParser", str(self.loader))

     def test_include(self):
-        #Tests for the include functionality.
+        # Tests for the include functionality.
         self.loader.include(["So.rc.*"])

         self.assertIn("Source: Source", str(self.loader))
@@ -25,7 +24,7 @@ class TestSourceloader(unittest.TestCase):
         self.assertNotIn("Source: WikipediaParser", str(self.loader))

     def test_exclude(self):
-        #Tests for the exclude functionality.
+        # Tests for the exclude functionality.
         self.loader.exclude(["So.rc.*"])

         self.assertNotIn("Source: Source", str(self.loader))

View File

@@ -8,7 +8,6 @@ from FourmiCrawler.sources.source import Source
 class TestFoumiSpider(unittest.TestCase):
     def setUp(self):
         self.compound = "test_compound"
         self.attributes = ["a.*", ".*a"]
-