Skip to content

Commit

Permalink
added excel (xlsx) attachments
Browse files Browse the repository at this point in the history
  • Loading branch information
robertdavidwest committed Jan 2, 2019
1 parent f926c2f commit 228ef2d
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 15 deletions.
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,15 @@ from google_api import gmail
# get all attachments from e-mails containing 'test'
search_query = "test"
service = gmail.get_gmail_service(GMAIL_CREDENTIALS_PATH, GMAIL_TOKEN_PATH)
csv_dfs = gmail.query_for_csv_attachments(service, search_query)
print(csv_dfs)

results = gmail.query_for_csv_or_xl_attachments(service, search_query)

# 1st Attachment found:
item = results[0]
df = item['data']
print('email: ' + item['emailsubject'])
print('filename: ' + item['filename'])
print("data sample: ")
print(df.head())

```
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ dependencies:
- pandas
- oauth2client
- google-api-python-client
- xlrd
9 changes: 8 additions & 1 deletion example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,12 @@
search_query = "Encrave"
service = gmail.get_gmail_service(GMAIL_CREDENTIALS_PATH,
GMAIL_TOKEN_PATH)
csv_dfs = gmail.query_for_csv_attachments(service, search_query)
csvs_and_excel = gmail.query_for_csv_or_xl_attachments(service, search_query)

# 1st Attachment found:
item = csvs_and_excel[0]
df = item['data']
print('email: ' + item['emailsubject'])
print('filename: ' + item['filename'])
print("data sample: ")
print(df.head())
2 changes: 1 addition & 1 deletion google_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
import gmail

__version__ = "0.0.1"
__version__ = "0.0.2"
1 change: 1 addition & 0 deletions google_api/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# If modifying these scopes, delete the file token.json.
SCOPES = 'https://www.googleapis.com/auth/gmail.readonly'
CSV_MIME_TYPE = 'text/csv'
XLSX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'

34 changes: 25 additions & 9 deletions google_api/gmail.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,19 @@

from config import (
SCOPES,
CSV_MIME_TYPE
CSV_MIME_TYPE,
XLSX_MIME_TYPE
)


def mime_type_to_dtype(s):
    """Translate an attachment MIME type into the short data-type tag.

    Returns 'csv' when `s` equals CSV_MIME_TYPE and 'xlsx' when it equals
    XLSX_MIME_TYPE. Any other value raises AssertionError.
    """
    # Ordered (mime type, tag) pairs; compared with == in the same order
    # as the accepted types are declared.
    accepted = (
        (CSV_MIME_TYPE, 'csv'),
        (XLSX_MIME_TYPE, 'xlsx'),
    )
    for mime_type, dtype in accepted:
        if s == mime_type:
            return dtype
    raise AssertionError("mime type not accepted")


def get_gmail_service(credentials_path, token_path):
store = file.Storage(token_path)
creds = store.get()
Expand Down Expand Up @@ -52,9 +62,12 @@ def _get_attachment_from_part(service, messageId, part):
return _get_attachment_data(service, messageId, attachmentId)


def _convert_attachment_data_to_dataframe(data, data_type):
    """Decode a base64url-encoded attachment and parse it into a DataFrame.

    Args:
        data: the attachment payload as URL-safe base64 text.
        data_type: 'csv' to parse with `pd.read_csv`, or 'xlsx' to parse
            with `pd.read_excel`.

    Returns:
        A pd.DataFrame holding the parsed attachment.

    Raises:
        ValueError: if `data_type` is neither 'csv' nor 'xlsx'.
    """
    # Imported locally so this function does not rely on the file-level
    # import list; io.BytesIO exists on both Python 2 and Python 3.
    from io import BytesIO

    if data_type == 'csv':
        reader = pd.read_csv
    elif data_type == 'xlsx':
        reader = pd.read_excel
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise ValueError("unsupported data type: %r" % (data_type,))

    # The decoded payload is raw bytes (an xlsx workbook is a binary zip
    # archive), so it must be wrapped in a bytes buffer, not a text buffer.
    raw_bytes = base64.urlsafe_b64decode(data.encode('UTF-8'))
    return reader(BytesIO(raw_bytes))


Expand All @@ -68,7 +81,7 @@ def _flatten_nested_email_parts(parts):
return all_parts


def get_csv_attachments_from_msg_id(service, messageId):
def get_csv_or_xl_attachments_from_msg_id(service, messageId):
"""returns a list with one dict per CSV or Excel (xlsx) attachment
in the email associated with `messageId`. Each dict has the keys
'emailsubject', 'filename' and 'data' (the attachment as a pd.DataFrame)"""
Expand All @@ -79,18 +92,21 @@ def get_csv_attachments_from_msg_id(service, messageId):
if not msg_parts:
return []
msg_parts = _flatten_nested_email_parts(msg_parts)
att_parts = [p for p in msg_parts if p['mimeType']==CSV_MIME_TYPE]
att_parts = [p for p in msg_parts if p['mimeType'] in [
CSV_MIME_TYPE, XLSX_MIME_TYPE]]
types = [mime_type_to_dtype(p['mimeType']) for p in att_parts]
filenames = [p['filename'] for p in att_parts]
datas = [_get_attachment_from_part(service, messageId, p) for p in att_parts]
dfs = [_convert_attachment_data_to_dataframe(d) for d in datas]
dfs = [_convert_attachment_data_to_dataframe(d, t)
for d, t in zip(datas, types)]
return [{'emailsubject': subject, 'filename': f, 'data': d}
for f, d in zip(filenames, dfs)]


def query_for_csv_or_xl_attachments(service, search_query):
    """Collect the CSV/Excel attachments of every message matching a query.

    Looks up the message ids for `search_query`, fetches the attachments
    of each message in turn, and returns them all in one flat list, in
    message order.
    """
    return [
        attachment
        for message_id in query_for_message_ids(service, search_query)
        for attachment in get_csv_or_xl_attachments_from_msg_id(
            service, message_id)
    ]
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from setuptools import setup
from google_api import __version__
setup(name='google_api',
setup(name='google_api_rdw',
version=__version__,
description='Connect to the google api',
url='https://github.com/robertdavidwest/google_api',
Expand Down

0 comments on commit 228ef2d

Please sign in to comment.