Merge branch 'release/0.3.1'
This commit is contained in:
commit
e7c17ea269
@ -5,6 +5,22 @@
|
|||||||
import re
|
import re
|
||||||
from scrapy.exceptions import DropItem
|
from scrapy.exceptions import DropItem
|
||||||
|
|
||||||
|
class RemoveNonePipeline(object):
    """Item pipeline that replaces ``None`` field values with empty strings."""

    def __init__(self):
        # Not read anywhere in this class; kept so the set of instance
        # attributes stays the same for any external code that inspects it.
        self.known_values = set()

    def process_item(self, item, spider):
        """
        Replace every None value in the item with an empty string.

        The item is modified in place. Only values that are exactly None are
        replaced; other falsy values (0, False, "") are left untouched.

        :param item: The incoming item
        :param spider: The spider which scraped the item (not used here)
        :return: The same item object, with None values replaced by ""
        """
        for key in item:
            # Identity check on purpose: only None is rewritten.
            if item[key] is None:
                item[key] = ""
        return item
|
||||||
|
|
||||||
class DuplicatePipeline(object):
|
class DuplicatePipeline(object):
|
||||||
|
|
||||||
|
@ -11,8 +11,9 @@ BOT_NAME = 'FourmiCrawler'
|
|||||||
SPIDER_MODULES = ['FourmiCrawler']
|
SPIDER_MODULES = ['FourmiCrawler']
|
||||||
NEWSPIDER_MODULE = 'FourmiCrawler'
|
NEWSPIDER_MODULE = 'FourmiCrawler'
|
||||||
ITEM_PIPELINES = {
|
ITEM_PIPELINES = {
|
||||||
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,
|
"FourmiCrawler.pipelines.RemoveNonePipeline": 100,
|
||||||
'FourmiCrawler.pipelines.DuplicatePipeline': 200,
|
'FourmiCrawler.pipelines.AttributeSelectionPipeline': 200,
|
||||||
|
'FourmiCrawler.pipelines.DuplicatePipeline': 300,
|
||||||
}
|
}
|
||||||
FEED_URI = 'results.json'
|
FEED_URI = 'results.json'
|
||||||
FEED_FORMAT = 'jsonlines'
|
FEED_FORMAT = 'jsonlines'
|
||||||
|
@ -80,7 +80,7 @@ def search(docopt_arguments, source_loader):
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.0')
|
arguments = docopt.docopt(__doc__, version='Fourmi - V0.3.1')
|
||||||
loader = SourceLoader()
|
loader = SourceLoader()
|
||||||
|
|
||||||
if arguments["--include"]:
|
if arguments["--include"]:
|
||||||
|
Reference in New Issue
Block a user