Merge branch 'feature/attribute-selection' into develop

2014-05-08 15:48:44 +02:00 · 2014-05-08 15:48:44 +02:00 · 4203a65b57
commit 4203a65b57
parent 81886981a3 f193aac24a
4 changed files with 31 additions and 9 deletions
--- a/FourmiCrawler/pipelines.py
+++ b/FourmiCrawler/pipelines.py
@ -2,10 +2,11 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import re
 from scrapy.exceptions import DropItem


-class FourmiPipeline(object):
+class DuplicatePipeline(object):

    def __init__(self):
        self.known_values = set()
@ -17,9 +18,27 @@ class FourmiPipeline(object):
        :param spider: The spider which scraped the spider
        :return: :raise DropItem: Returns the item if unique or drops them if it's already known
        """
-        value = item['attribute'], item['value']
+        value = (item['attribute'], item['value'], item['conditions'])
        if value in self.known_values:
            raise DropItem("Duplicate item found: %s" % item) # #[todo] append sources of first item.
        else:
            self.known_values.add(value)
            return item
+
+class AttributeSelectionPipeline(object):
+    """Drop every scraped item whose attribute was not selected by the user."""
+
+    def process_item(self, item, spider):
+        """
+        Pass the item on only if its attribute matches one of the spider's selected patterns.
+
+        :param item: The incoming item; its "attribute" field is tested.
+        :param spider: The spider which scraped the item. Should have an attribute
+            "selected_attributes": an iterable of regular expressions.
+        :return: The item, if at least one pattern matches its attribute.
+        :raise DropItem: If none of the selected patterns matches.
+        """
+        # re.match only anchors at the start of the attribute; any() stops at the first hit.
+        if any(re.match(pattern, item["attribute"]) for pattern in spider.selected_attributes):
+            return item
+        raise DropItem("Attribute not selected by user: %s" % item)
--- a/FourmiCrawler/settings.py
+++ b/FourmiCrawler/settings.py
@ -11,7 +11,8 @@ BOT_NAME = 'FourmiCrawler'
 SPIDER_MODULES = ['FourmiCrawler']
 NEWSPIDER_MODULE = 'FourmiCrawler'
 ITEM_PIPELINES = {
-    'FourmiCrawler.pipelines.FourmiPipeline': 100
+    'FourmiCrawler.pipelines.AttributeSelectionPipeline': 100,  # Scrapy runs lower values first: filter by attribute...
+    'FourmiCrawler.pipelines.DuplicatePipeline': 200,  # ...then de-duplicate only the items that survived the filter
 }
 FEED_URI = 'results.json'
 FEED_FORMAT = 'jsonlines'
--- a/FourmiCrawler/spider.py
+++ b/FourmiCrawler/spider.py
@ -8,9 +8,10 @@ class FourmiSpider(Spider):
    __parsers = []
    synonyms = []

-    def __init__(self, compound=None, *args, **kwargs):
+    def __init__(self, compound=None, selected_attributes=None, *args, **kwargs):
        super(FourmiSpider, self).__init__(*args, **kwargs)
        self.synonyms.append(compound)
+        self.selected_attributes = [".*"] if selected_attributes is None else selected_attributes  # None -> match every attribute; no shared mutable default

    def parse(self, reponse):
        for parser in self.__parsers:
--- a/fourmi.py
+++ b/fourmi.py
@ -12,14 +12,15 @@ Usage:
    fourmi --version

 Options:
+    --attributes=<regex>            Include only attributes that match these regular expressions split by a comma. [default: .*]
    -h --help                       Show this screen.
    --version                       Show version.
    --verbose                       Verbose logging output.
    --log=<file>                    Save log to an file.
    -o <file> --output=<file>       Output file [default: result.*format*]
    -f <format> --format=<format>   Output formats (supported: csv, json, jsonlines, xml) [default: jsonlines]
-    --include=<sourcenames>         Include only sources that match the regular these expressions split by a comma.
-    --exclude=<sourcenames>         Exclude the sources that match the regular these expressions split by a comma.
+    --include=<regex>               Include only sources that match these regular expressions split by a comma.
+    --exclude=<regex>               Exclude the sources that match these regular expressions split by a comma.
 """

 from twisted.internet import reactor
@ -32,8 +33,8 @@ from FourmiCrawler.spider import FourmiSpider
 from sourceloader import SourceLoader


-def setup_crawler(searchable, settings, source_loader):
-    spider = FourmiSpider(compound=searchable)
+def setup_crawler(searchable, settings, source_loader, attributes):
+    spider = FourmiSpider(compound=searchable, selected_attributes=attributes)
    spider.add_parsers(source_loader.sources)
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
@ -74,7 +75,7 @@ def start_log(docopt_arguments):
 def search(docopt_arguments, source_loader):
    start_log(docopt_arguments)
    settings = scrapy_settings_manipulation(docopt_arguments)
-    setup_crawler(docopt_arguments["<compound>"], settings, source_loader)
+    setup_crawler(docopt_arguments["<compound>"], settings, source_loader, docopt_arguments["--attributes"].split(','))
    reactor.run()