RobDWaller · Tomasz-Kluczkowski · Jan 8, 2019 · Jan 8, 2019 · Jan 8, 2019 · Jan 8, 2019
diff --git a/data_extractor.py b/data_extractor.py
@@ -0,0 +1,67 @@
+class DataExtractor:
+    """
+    Use to extract, cleanse, sum and amend incorrect website data collection.
+
+    Records handed to the constructor are never modified; the amending methods work on shallow copies.
+    """
+    def __init__(self, data):
+        """
+        :param data: list(dict), web records. The methods read the keys "value", "domain", "url" and "secure".
+        """
+        self.data = data
+
+    def find_items(self, value=4):
+        """
+        Find and return a new list of items where key "value" is greater than or equal to parameter value.
+        Items without a "value" (missing or None) are skipped; a "value" of 0 is a regular number.
+        :param value: int, value to find items for.
+        :return: list(dict), list of dictionaries matching the above filtering rule.
+        """
+        # Compare against None explicitly: a plain truthiness check would also drop records whose value is 0.
+        return [item for item in self.data if item.get('value') is not None and item['value'] >= value]
+
+    def amend_domain_values(self, prefix='www.'):
+        """
+        Fixes missing parts of the domain names. By default we add missing 'www.'.
+        Records with no (or empty) "domain" are passed through unchanged.
+        :param prefix: str, prefix to add to the domain name.
+        :return: amended: list(dict), amended list of web records (copies, self.data is left untouched).
+        """
+        amended = []
+        for item in self.data:
+            # Shallow copy so that fixing the domain does not leak into the caller's records.
+            record = dict(item)
+            domain = record.get('domain')
+            if domain and not domain.startswith(prefix):
+                record['domain'] = f"{prefix}{domain}"
+            amended.append(record)
+        return amended
+
+    def cleanse_data(self):
+        """
+        Fix errors in "secure" key values. All urls starting with https should be set to "secure": True, those starting
+        with http "secure": False. The scheme is matched case-insensitively. Records with no url, or with another
+        scheme, keep whatever "secure" value they had.
+        :return: amended: list(dict), amended list of web records (copies, self.data is left untouched).
+        """
+        amended = []
+        for item in self.data:
+            # Shallow copy so that fixing the flag does not leak into the caller's records.
+            record = dict(item)
+            url = record.get('url')
+            if url:
+                scheme = url.lower()
+                # The url alone decides the flag, so a missing or non-boolean "secure" is normalised as well.
+                if scheme.startswith('https:'):
+                    record['secure'] = True
+                elif scheme.startswith('http:'):
+                    record['secure'] = False
+            amended.append(record)
+        return amended
+
+    def get_value_sum(self):
+        """
+        Returns sum of all value keys in the data set. Missing or None values count as 0.
+        :return: int, sum of all value keys in the data set.
+        """
+        return sum(item.get('value') or 0 for item in self.data)
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/test_data_extractor.py b/tests/test_data_extractor.py
@@ -0,0 +1,212 @@
+import copy
+
+from data_extractor import DataExtractor
+from websites.resources.data import WEBSITES
+
+# Kept so the module-level name stays available; it wraps a deep copy so it can never alter WEBSITES itself.
+data_extractor = DataExtractor(copy.deepcopy(WEBSITES))
+
+
+def _fresh_extractor():
+    """
+    Build a DataExtractor over a private deep copy of WEBSITES.
+    DataExtractor may amend the records it is given, so every test works on its own copy; that keeps the tests
+    independent of the order in which they are run.
+    :return: DataExtractor, extractor owning its own copy of the fixture records.
+    """
+    return DataExtractor(copy.deepcopy(WEBSITES))
+
+
+class TestDataExtractor:
+    """
+    Tests for DataExtractor: filtering by value, domain amending, "secure" flag cleansing and value summing.
+    """
+
+    def test_find_items(self):
+        """The default threshold keeps only the records whose value is 4 or more."""
+        expected = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'google.co.uk',
+                'secure': True,
+                'value': 5,
+            },
+            {
+                'name': 'Facebook',
+                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'facebook.com',
+                'secure': True,
+                'value': 4,
+            },
+            {
+                'name': 'YouTube',
+                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
+                'domain': 'youtube.com',
+                'secure': True,
+                'value': 5,
+            },
+        ]
+        assert _fresh_extractor().find_items() == expected
+
+    def test_find_items_none_found(self):
+        """A threshold above every value yields an empty list."""
+        assert _fresh_extractor().find_items(100) == []
+
+    def test_find_items_all_matching(self):
+        """A threshold of 1 is expected to match every record of WEBSITES."""
+        assert _fresh_extractor().find_items(1) == WEBSITES
+
+    def test_amend_domain_values(self):
+        """Every domain lacking the default 'www.' prefix gets it added."""
+        expected = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'www.google.co.uk',
+                'secure': True,
+                'value': 5,
+            },
+            {
+                'name': 'Facebook',
+                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'www.facebook.com',
+                'secure': True,
+                'value': 4,
+            },
+            {
+                'name': 'Bing',
+                'url': 'https://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
+                'domain': 'www.bing.com',
+                'secure': False,
+                'value': 3,
+            },
+            {
+                'name': 'Ask',
+                'url': 'https://uk.ask.com/web?o=0&l=dir&qo=serpSearchTopBox&q=jupiter',
+                'domain': 'www.ask.com',
+                'secure': False,
+                'value': 1,
+            },
+            {
+                'name': 'Duck Duck Go',
+                'url': 'http://duckduckgo.com/?q=plane&t=h_&ia=web',
+                'domain': 'www.duckduckgo.com',
+                'secure': True,
+                'value': 2,
+            },
+            {
+                'name': 'Vimeo',
+                'url': 'https://vimeo.com/53812885',
+                'domain': 'www.vimeo.com',
+                'secure': False,
+                'value': 2,
+            },
+            {
+                'name': 'YouTube',
+                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
+                'domain': 'www.youtube.com',
+                'secure': True,
+                'value': 5,
+            },
+            {
+                'name': 'Daily Motion',
+                'url': 'http://www.dailymotion.com/search/football',
+                'domain': 'www.dailymotion.com',
+                'secure': True,
+                'value': 1,
+            },
+        ]
+        assert _fresh_extractor().amend_domain_values() == expected
+
+    def test_amend_domain_values_retains_original_if_prefix_matching(self):
+        """A domain that already starts with the prefix is returned as it was."""
+        test_data = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'www.google.co.uk',
+                'secure': True,
+                'value': 5,
+            },
+        ]
+        # Snapshot the input first: comparing the result with test_data itself would still pass if the
+        # records had been altered in place, because both sides would then be the very same objects.
+        expected = copy.deepcopy(test_data)
+        _data_extractor = DataExtractor(test_data)
+        assert _data_extractor.amend_domain_values() == expected
+
+    def test_cleanse_data(self):
+        """Wrong "secure" flags are corrected from the url scheme, correct ones are kept."""
+        test_data = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'google.co.uk',
+                'secure': False,
+                'value': 5,
+            },
+            {
+                'name': 'Facebook',
+                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'facebook.com',
+                'secure': True,
+                'value': 4,
+            },
+            {
+                'name': 'Bing',
+                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
+                'domain': 'bing.com',
+                'secure': False,
+                'value': 3,
+            },
+            {
+                'name': 'Duck Duck Go',
+                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
+                'domain': 'duckduckgo.com',
+                'secure': True,
+                'value': 2,
+            },
+        ]
+
+        expected = [
+            {
+                'name': 'Google',
+                'url': 'https://www.google.co.uk',
+                'domain': 'google.co.uk',
+                'secure': True,
+                'value': 5,
+            },
+            {
+                'name': 'Facebook',
+                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
+                'domain': 'facebook.com',
+                'secure': False,
+                'value': 4,
+            },
+            {
+                'name': 'Bing',
+                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
+                'domain': 'bing.com',
+                'secure': False,
+                'value': 3,
+            },
+            {
+                'name': 'Duck Duck Go',
+                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
+                'domain': 'duckduckgo.com',
+                'secure': True,
+                'value': 2,
+            },
+        ]
+        _data_extractor = DataExtractor(test_data)
+        assert _data_extractor.cleanse_data() == expected
+
+    def test_get_value_sum(self):
+        """The values of all WEBSITES records are expected to add up to 23."""
+        assert _fresh_extractor().get_value_sum() == 23
+
+    def test_get_value_sum_empty_data_set(self):
+        """An empty data set sums to 0."""
+        _data_extractor = DataExtractor([])
+        assert _data_extractor.get_value_sum() == 0