From 98f91a1aa9b8b0fdce5a6c9de903695a46d0dcc9 Mon Sep 17 00:00:00 2001 From: "Jip J. Dekker" Date: Thu, 22 May 2014 12:15:43 +0200 Subject: [PATCH 1/2] Added a pipeline to replace None values with empty strings --- FourmiCrawler/pipelines.py | 16 ++++++++++++++++ FourmiCrawler/settings.py | 5 +++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/FourmiCrawler/pipelines.py b/FourmiCrawler/pipelines.py index e1dadbf..2c775f2 100644 --- a/FourmiCrawler/pipelines.py +++ b/FourmiCrawler/pipelines.py @@ -5,6 +5,22 @@ import re from scrapy.exceptions import DropItem +class RemoveNonePipeline(object): + + def __init__(self): + self.known_values = set() + + def process_item(self, item, spider): + """ + Processing the items so None values are replaced by empty strings + :param item: The incoming item + :param spider: The spider which scraped the item + :return: The item, with every None value replaced by an empty string + """ + for key in item: + if item[key] is None: + item[key] = "" + return item class DuplicatePipeline(object): diff --git a/FourmiCrawler/settings.py b/FourmiCrawler/settings.py index d7ac212..be7c451 100644 --- a/FourmiCrawler/settings.py +++ b/FourmiCrawler/settings.py @@ -11,8 +11,9 @@ BOT_NAME = 'FourmiCrawler' SPIDER_MODULES = ['FourmiCrawler'] NEWSPIDER_MODULE = 'FourmiCrawler' ITEM_PIPELINES = { - 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100, - 'FourmiCrawler.pipelines.DuplicatePipeline': 200, + "FourmiCrawler.pipelines.RemoveNonePipeline": 100, + 'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200, + 'FourmiCrawler.pipelines.DuplicatePipeline': 300, } FEED_URI = 'results.json' FEED_FORMAT = 'jsonlines' From f80a32a0dcb0e5216d3301969e6f360fc1d8cc31 Mon Sep 17 00:00:00 2001 From: "Jip J. 
Dekker" Date: Thu, 22 May 2014 12:19:41 +0200 Subject: [PATCH 2/2] Pushed the version number --- fourmi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fourmi.py b/fourmi.py index efa4e54..08e010b 100755 --- a/fourmi.py +++ b/fourmi.py @@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader): if __name__ == '__main__': - arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0') + arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1') loader = SourceLoader() if arguments["--include"]: