Skip to content

Commit 3978df4

Browse files
committed
fix: some email validation and encoding by latin-1
1 parent c35bcae commit 3978df4

File tree

2 files changed

+43
-6
lines changed

2 files changed

+43
-6
lines changed

scripts/assignment_validation.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,18 @@
1414
"""
1515
import csv
1616
from collections import defaultdict, Counter
17+
from email.utils import parseaddr
1718

1819
import click
1920

20-
INPUT_FIELDNAMES = ['email', 'university_name']
21+
INPUT_FIELDNAMES = ['university_name', 'email']
2122

2223

2324
def _iterate_csv(input_file):
24-
with open(input_file, 'r') as f_in:
25+
with open(input_file, 'r', encoding='latin-1') as f_in:
2526
reader = csv.DictReader(f_in, fieldnames=INPUT_FIELDNAMES, delimiter=',')
2627
# read and skip the header
2728
next(reader, None)
28-
breakpoint()
2929
for row in reader:
3030
yield row
3131

@@ -42,7 +42,7 @@ def print_duplicates(input_file):
4242

4343
for email, uni_list in unis_by_email.items():
4444
if len(uni_list) > 1:
45-
print(email, uni_list)
45+
print(email or 'THE EMPTY STRING', 'is contained in', len(uni_list), 'different rows')
4646

4747

4848
@click.command()
@@ -59,13 +59,36 @@ def print_plan_counts(input_file):
5959
print(plan, count)
6060

6161

62+
def is_valid_email(email):
    """
    Return True if ``email`` has the basic ``local@domain`` shape.

    ``parseaddr`` on its own is too lenient for validation: it returns a
    non-empty address for any bare token (e.g. ``'foo'``), so a value
    without an ``@`` would otherwise be reported as valid.

    Args:
        email: The raw value read from the input row. May be ``None`` or
            another non-string when the row is short or malformed.

    Returns:
        bool: ``True`` only when an address can be parsed out of ``email``
        and it has a non-empty local part and a non-empty domain.
    """
    # Non-string values (e.g. None for a missing CSV column) are invalid;
    # reject them up front instead of letting parseaddr fail on them.
    if not isinstance(email, str):
        return False

    _, address = parseaddr(email)
    if not address:
        return False

    # Split on the last '@' so that a quoted local part containing '@'
    # still yields the real domain.
    local_part, separator, domain = address.rpartition('@')
    return bool(separator and local_part and domain)
67+
68+
69+
@click.command()
@click.option(
    '--input-file',
    help='Path of local file containing email addresses to assign.',
)
def validate_emails(input_file):
    # NOTE: documented with comments rather than a docstring, because click
    # would surface a docstring as this command's help text.
    #
    # Tally every rejected value by the number of rows it appears in.
    rejected = Counter(
        row['email']
        for row in _iterate_csv(input_file)
        if not is_valid_email(row['email'])
    )
    # The total counts rows, not distinct values.
    rejected_total = sum(rejected.values())

    print(f'There were {rejected_total} invalid emails')
    print(rejected)
82+
83+
6284
@click.group()
def run():
    # Intentionally empty: this group only serves as the entry point that
    # the individual commands are attached to via ``run.add_command``.
    # (A comment is used instead of a docstring so the group's help text
    # stays unchanged.)
    return None
6587

6688

6789
run.add_command(print_duplicates)
6890
run.add_command(print_plan_counts)
91+
run.add_command(validate_emails)
6992

7093

7194
if __name__ == '__main__':

scripts/local_assignment_multi.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@
4040
import csv
4141
import json
4242
import os
43+
import re
4344
import time
45+
from email.utils import parseaddr
4446
from pprint import pprint
4547

4648
import click
@@ -146,6 +148,11 @@ def get_plan_uuids_by_name(plans_by_name_file):
146148
return plans_by_name
147149

148150

151+
def is_valid_email(email):
    """
    Return True if ``email`` has the basic ``local@domain`` shape.

    ``parseaddr`` on its own is too lenient for validation: it returns a
    non-empty address for any bare token (e.g. ``'foo'``), so a value
    without an ``@`` would otherwise be treated as valid and assigned.

    Args:
        email: The raw value read from the input row. May be ``None`` or
            another non-string when the row is short or malformed.

    Returns:
        bool: ``True`` only when an address can be parsed out of ``email``
        and it has a non-empty local part and a non-empty domain.
    """
    # Non-string values (e.g. None for a missing CSV column) are invalid;
    # reject them up front instead of letting parseaddr fail on them.
    if not isinstance(email, str):
        return False

    _, address = parseaddr(email)
    if not address:
        return False

    # Split on the last '@' so that a quoted local part containing '@'
    # still yields the real domain.
    local_part, separator, domain = address.rpartition('@')
    return bool(separator and local_part and domain)
154+
155+
149156
def get_email_chunks(input_file_path, plans_by_name, chunk_size=DEFAULT_CHUNK_SIZE):
150157
"""
151158
Yield chunks of (chunk_id, subscription_plan, email) from the given input file.
@@ -159,14 +166,21 @@ def get_email_chunks(input_file_path, plans_by_name, chunk_size=DEFAULT_CHUNK_SI
159166
current_chunk = []
160167
chunk_id = 0
161168
current_subscription_plan_uuid = None
162-
with open(input_file_path, 'r') as f_in:
169+
# CSVs can contain non-ASCII characters; latin-1
170+
# is the encoding that currently works with our production input.
171+
# This could eventually be parameterized as an input to this command.
172+
with open(input_file_path, 'r', encoding='latin-1') as f_in:
163173
reader = csv.DictReader(f_in, fieldnames=INPUT_FIELDNAMES, delimiter=',')
164174

165175
# read and skip the header
166176
next(reader)
167177

168178
for row in reader:
169179
email = row['email']
180+
if not is_valid_email(email):
181+
print("Invalid email:", email)
182+
continue
183+
170184
university_name = row['university_name']
171185
subscription_plan_uuid = plans_by_name[university_name]
172186

@@ -198,7 +212,7 @@ def get_email_chunks(input_file_path, plans_by_name, chunk_size=DEFAULT_CHUNK_SI
198212

199213
def _post_assignments(subscription_plan_uuid, emails_for_chunk, environment='local', fetch_jwt=False):
200214
"""
201-
Maket the POST request to assign licenses.
215+
Make the POST request to assign licenses.
202216
"""
203217
url_pattern = ENVIRONMENTS[environment]
204218
url = url_pattern.format(subscription_plan_uuid=subscription_plan_uuid)

0 commit comments

Comments
 (0)