Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Web data extraction tool #1

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions data_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
class DataExtractor:
    """
    Use to extract, cleanse, sum and amend incorrect website data collection.

    The extractor keeps a reference to the list of record dictionaries it is given (no copy is made).
    ``amend_domain_values`` and ``cleanse_data`` update those dictionaries in place and return them.
    """

    def __init__(self, data):
        """
        :param data: list(dict), web records to work on; stored by reference.
        """
        self.data = data

    def find_items(self, value=4):
        """
        Find and return a new list of items where key "value" is greater than or equal to parameter value.
        Records without a "value" key (or with "value" set to None) never match.
        :param value: int, value to find items for.
        :return: list(dict), list of dictionaries matching the above filtering rule.
        """
        matching = []
        for item in self.data:
            item_value = item.get('value')
            # Test for None explicitly: a truthiness check would wrongly discard records whose value is 0.
            if item_value is not None and item_value >= value:
                matching.append(item)
        return matching

    def amend_domain_values(self, prefix='www.'):
        """
        Fixes missing parts of the domain names. By default we add missing 'www.'.
        Records with a missing or empty "domain" are returned untouched. Records are modified in place.
        :param prefix: str, prefix to add to the domain name.
        :return: amended: list(dict), amended list of web records.
        """
        amended = []
        for item in self.data:
            domain = item.get('domain')
            if domain and not domain.startswith(prefix):
                item['domain'] = f"{prefix}{domain}"
            amended.append(item)
        return amended

    def cleanse_data(self):
        """
        Fix errors in "secure" key values. All urls starting with https should be set to "secure": True, those starting
        with http "secure": False. Records with a missing or empty "url", or with any other scheme, are left as they
        are. Records are modified in place.
        :return: amended: list(dict), amended list of web records.
        """
        amended = []
        for item in self.data:
            url = item.get('url')
            if url:
                # Derive the flag from the scheme alone, so a missing or None "secure" is also normalised
                # (previously an http url without a "secure" key was left without one).
                if url.startswith('https:'):
                    item['secure'] = True
                elif url.startswith('http:'):
                    item['secure'] = False
            amended.append(item)
        return amended

    def get_value_sum(self):
        """
        Returns sum of all value keys in the data set. Records with a missing "value" key, or with "value" set to
        None, count as 0.
        :return: int, sum of all value keys in the data set.
        """
        # "or 0" also covers an explicit None, which .get('value', 0) would pass straight to sum().
        return sum(item.get('value') or 0 for item in self.data)
Empty file added tests/__init__.py
Empty file.
179 changes: 179 additions & 0 deletions tests/test_data_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
from data_extractor import DataExtractor
from websites.resources.data import WEBSITES

# Module-level extractor shared by the tests below that do not build their own instance.
# NOTE(review): DataExtractor.amend_domain_values() and cleanse_data() modify the records they are given
# in place, so any test calling them through this shared instance also changes WEBSITES for later tests.
data_extractor = DataExtractor(WEBSITES)


class TestDataExtractor:
    """
    Tests for DataExtractor.

    Read-only checks (find_items, get_value_sum) use the module-level ``data_extractor``; tests of methods that
    modify records build their own extractor over private data so they cannot influence each other.
    """

    def test_find_items(self):
        # Default threshold is 4, so only records with value >= 4 are expected.
        expected = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'google.co.uk',
                'secure': True,
                'value': 5},
            {
                'name': 'Facebook',
                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'facebook.com',
                'secure': True,
                'value': 4},
            {
                'name': 'YouTube',
                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
                'domain': 'youtube.com',
                'secure': True,
                'value': 5
            }
        ]
        assert data_extractor.find_items() == expected

    def test_find_items_none_found(self):
        assert data_extractor.find_items(100) == []

    def test_find_items_all_matching(self):
        assert data_extractor.find_items(1) == WEBSITES

    def test_amend_domain_values(self):
        expected = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'www.google.co.uk',
                'secure': True,
                'value': 5},
            {
                'name': 'Facebook',
                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'www.facebook.com',
                'secure': True, 'value': 4},
            {
                'name': 'Bing',
                'url': 'https://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
                'domain': 'www.bing.com',
                'secure': False,
                'value': 3
            },
            {
                'name': 'Ask',
                'url': 'https://uk.ask.com/web?o=0&l=dir&qo=serpSearchTopBox&q=jupiter',
                'domain': 'www.ask.com',
                'secure': False,
                'value': 1},
            {
                'name': 'Duck Duck Go',
                'url': 'http://duckduckgo.com/?q=plane&t=h_&ia=web',
                'domain': 'www.duckduckgo.com',
                'secure': True,
                'value': 2
            },
            {
                'name': 'Vimeo',
                'url': 'https://vimeo.com/53812885',
                'domain': 'www.vimeo.com',
                'secure': False,
                'value': 2
            },
            {
                'name': 'YouTube',
                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
                'domain': 'www.youtube.com',
                'secure': True,
                'value': 5
            },
            {
                'name': 'Daily Motion',
                'url': 'http://www.dailymotion.com/search/football',
                'domain': 'www.dailymotion.com',
                'secure': True,
                'value': 1
            }
        ]
        # amend_domain_values() rewrites the records it is given, so run it on per-record copies: using the
        # shared extractor would prefix the domains in WEBSITES itself and make test_find_items depend on
        # execution order.
        _data_extractor = DataExtractor([dict(item) for item in WEBSITES])
        assert _data_extractor.amend_domain_values() == expected

    def test_amend_domain_values_retains_original_if_prefix_matching(self):
        test_data = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'www.google.co.uk',
                'secure': True,
                'value': 5
            }
        ]
        # The method returns the same dictionaries it received, so comparing the result with test_data would
        # always pass. Compare against an independent literal instead.
        expected = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'www.google.co.uk',
                'secure': True,
                'value': 5
            }
        ]
        _data_extractor = DataExtractor(test_data)
        assert _data_extractor.amend_domain_values() == expected

    def test_cleanse_data(self):
        # Covers all four combinations: https/False, http/True (both wrong) and http/False, https/True (both right).
        test_data = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'google.co.uk',
                'secure': False,
                'value': 5
            },
            {
                'name': 'Facebook',
                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'facebook.com',
                'secure': True,
                'value': 4
            },
            {
                'name': 'Bing',
                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
                'domain': 'bing.com',
                'secure': False,
                'value': 3
            },
            {
                'name': 'Duck Duck Go',
                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
                'domain': 'duckduckgo.com',
                'secure': True,
                'value': 2
            },
        ]

        expected = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'google.co.uk',
                'secure': True,
                'value': 5
            },
            {
                'name': 'Facebook',
                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'facebook.com',
                'secure': False,
                'value': 4
            },
            {
                'name': 'Bing',
                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
                'domain': 'bing.com',
                'secure': False,
                'value': 3
            },
            {
                'name': 'Duck Duck Go',
                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
                'domain': 'duckduckgo.com',
                'secure': True,
                'value': 2
            },
        ]
        _data_extractor = DataExtractor(test_data)
        assert _data_extractor.cleanse_data() == expected

    def test_get_value_sum(self):
        assert data_extractor.get_value_sum() == 23

    def test_get_value_sum_empty_data_set(self):
        _data_extractor = DataExtractor([])
        assert _data_extractor.get_value_sum() == 0