From f8d390d3e604bedc2a428cc24824830a8bc31d5a Mon Sep 17 00:00:00 2001
From: Bas Vb <bas.berkel@student.ru.nl>
Date: Thu, 1 May 2014 15:04:11 +0200
Subject: [PATCH 1/3] Starting with fixing the wikiparser

---
 FourmiCrawler/sources/WikipediaParser.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index c251fca..8d8cded 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -37,7 +37,7 @@ class WikipediaParser(Source):
         items = []
 
         #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
-        tr_list = sel.xpath('.//table[@class="infobox bordered" or @class="infobox"]//td[not(@colspan)]').\
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').\
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
@@ -51,6 +51,23 @@ class WikipediaParser(Source):
             })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr').\
+            xpath('normalize-space(string())')
+        log.msg('%s' %tr_list2,level=log.DEBUG)
+        #prop_names = tr_list2[::2]
+        #prop_values = tr_list2[1::2]
+        #for i, prop_name in enumerate(prop_names):
+        #    item = Result({
+        #        'attribute': prop_name.extract().encode('utf-8'),
+        #        'value': prop_values[i].extract().encode('utf-8'),
+        #        'source': "Wikipedia",
+        #        'reliability': "",
+        #        'conditions': ""
+        #    })
+        #    items.append(item)
+        #    log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)
 

From 03e652d454e34dbc30d9f2fa3c6f32ef57845e01 Mon Sep 17 00:00:00 2001
From: Bas Vb <bas.berkel@student.ru.nl>
Date: Thu, 1 May 2014 16:05:37 +0200
Subject: [PATCH 2/3] Wikipediaparser now works on chemboxes as well

---
 FourmiCrawler/sources/WikipediaParser.py | 31 ++++++++++++------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 8d8cded..2964567 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -46,27 +46,26 @@ class WikipediaParser(Source):
                 'attribute': prop_name.extract().encode('utf-8'),
                 'value': prop_values[i].extract().encode('utf-8'),
                 'source': "Wikipedia",
-                'reliability': "",
+                'reliability': "Unknown",
                 'conditions': ""
             })
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr').\
-            xpath('normalize-space(string())')
-        log.msg('%s' %tr_list2,level=log.DEBUG)
-        #prop_names = tr_list2[::2]
-        #prop_values = tr_list2[1::2]
-        #for i, prop_name in enumerate(prop_names):
-        #    item = Result({
-        #        'attribute': prop_name.extract().encode('utf-8'),
-        #        'value': prop_values[i].extract().encode('utf-8'),
-        #        'source': "Wikipedia",
-        #        'reliability': "",
-        #        'conditions': ""
-        #    })
-        #    items.append(item)
-        #    log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')#.xpath('normalize-space(string())')
+        log.msg('dit: %s' %tr_list2,level=log.DEBUG)
+        for tablerow in tr_list2:
+            log.msg('item: %s' %tablerow.xpath('./th').xpath('normalize-space(string())'),level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath('normalize-space(string())'):
+                item = Result({
+                    'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
+                    'source': "Wikipedia",
+                    'reliability': "Unknown",
+                    'conditions': ""
+                })
+                items.append(item)
+                log.msg('Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)

From b54568bab0281ab80ef9ce2e4ec3a94138322447 Mon Sep 17 00:00:00 2001
From: Bas Vb <bas.berkel@student.ru.nl>
Date: Tue, 13 May 2014 16:18:32 +0200
Subject: [PATCH 3/3] Small fixes

---
 FourmiCrawler/sources/WikipediaParser.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/FourmiCrawler/sources/WikipediaParser.py b/FourmiCrawler/sources/WikipediaParser.py
index 2964567..cb7d0b9 100644
--- a/FourmiCrawler/sources/WikipediaParser.py
+++ b/FourmiCrawler/sources/WikipediaParser.py
@@ -36,8 +36,8 @@ class WikipediaParser(Source):
         """ scrape data from infobox on wikipedia. """
         items = []
 
-        #be sure to get both chembox (wikipedia template) and drugbox (wikipedia template) to scrape
-        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]').\
+        #be sure to get chembox (wikipedia template)
+        tr_list = sel.xpath('.//table[@class="infobox bordered"]//td[not(@colspan)]'). \
             xpath('normalize-space(string())')
         prop_names = tr_list[::2]
         prop_values = tr_list[1::2]
@@ -52,11 +52,13 @@ class WikipediaParser(Source):
             items.append(item)
             log.msg('Wiki prop: |%s| |%s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
 
-        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')#.xpath('normalize-space(string())')
-        log.msg('dit: %s' %tr_list2,level=log.DEBUG)
+        # scrape the drugbox (wikipedia template)
+        tr_list2 = sel.xpath('.//table[@class="infobox"]//tr')
+        log.msg('dit: %s' % tr_list2, level=log.DEBUG)
         for tablerow in tr_list2:
-            log.msg('item: %s' %tablerow.xpath('./th').xpath('normalize-space(string())'),level=log.DEBUG)
-            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath('normalize-space(string())'):
+            log.msg('item: %s' % tablerow.xpath('./th').xpath('normalize-space(string())'), level=log.DEBUG)
+            if tablerow.xpath('./th').xpath('normalize-space(string())') and tablerow.xpath('./td').xpath(
+                    'normalize-space(string())'):
                 item = Result({
                     'attribute': tablerow.xpath('./th').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
                     'value': tablerow.xpath('./td').xpath('normalize-space(string())').extract()[0].encode('utf-8'),
@@ -65,7 +67,9 @@ class WikipediaParser(Source):
                     'conditions': ""
                 })
                 items.append(item)
-                log.msg('Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']), level=log.DEBUG)
+                log.msg(
+                    'Wiki prop: |attribute: %s| |value: %s| |%s|' % (item['attribute'], item['value'], item['source']),
+                    level=log.DEBUG)
 
         items = filter(lambda a: a['value'] != '', items)  # remove items with an empty value
         item_list = self.clean_items(items)