Archived
1
0

Merge branch 'release/0.3.1'

This commit is contained in:
Jip J. Dekker 2014-05-22 12:20:09 +02:00
commit e7c17ea269
3 changed files with 20 additions and 3 deletions

View File

@ -5,6 +5,22 @@
import re
from scrapy.exceptions import DropItem
class RemoveNonePipeline(object):
    """Item pipeline that replaces ``None`` field values with empty strings."""

    def process_item(self, item, spider):
        """
        Replace every None value in the item with an empty string.
        :param item: The incoming item; it is modified in place
        :param spider: The spider which scraped the item (not used here)
        :return: The same item object, with each None value replaced by ""
        """
        # Only existing keys are reassigned, so the item never changes size
        # while it is being iterated.
        for key in item:
            if item[key] is None:
                item[key] = ""
        return item
class DuplicatePipeline(object):

View File

@ -11,8 +11,9 @@ BOT_NAME = 'FourmiCrawler'
# Both settings name the 'FourmiCrawler' package; presumably this is where
# Scrapy looks up existing spiders and creates new ones — confirm against the
# Scrapy settings documentation.
SPIDER_MODULES = ['FourmiCrawler']
NEWSPIDER_MODULE = 'FourmiCrawler'
# Maps each pipeline class path to an integer; presumably Scrapy runs the
# pipelines in ascending order of that value — confirm.
ITEM_PIPELINES = {
# NOTE(review): this is a diff view. The next two entries appear to be the
# pre-change lines (the hunk header goes from 8 old lines to 9 new lines); the
# three entries after them are their replacements, which add
# RemoveNonePipeline at 100 and shift the other two pipelines up by 100.
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
'FourmiCrawler.pipelines.DuplicatePipeline': 200,
"FourmiCrawler.pipelines.RemoveNonePipeline": 100,
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
'FourmiCrawler.pipelines.DuplicatePipeline': 300,
}
# Presumably the feed export target file and its format (JSON Lines) — confirm.
FEED_URI = 'results.json'
FEED_FORMAT = 'jsonlines'

View File

@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
if __name__ == '__main__':
arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1')
loader = SourceLoader()
if arguments["--include"]: