Skip to content

Commit f04bf95

Browse files
committed
os swapped for pathlib
1 parent 99a8878 commit f04bf95

File tree

3 files changed

+64
-44
lines changed

3 files changed

+64
-44
lines changed

main.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,45 @@
1-
import glob, os
1+
import pathlib
22
import utils.cazy_functions as func
33
import utils.cazy_parse as parse
44
import utils.database_trim as trim
55

66
# Working Directories
7-
home_dir = os.getcwd()
8-
downloads_dir = f'{home_dir}/downloads'
9-
results_dir = f'{home_dir}/results'
7+
cwd = pathlib.Path.cwd()
8+
downloads_dir = cwd.joinpath('downloads')
9+
results_dir = cwd.joinpath('results')
1010

1111
def main():
    """Run the CAZy CBM pipeline end to end.

    Stages, in order: directory setup, CBM page download, HTML-to-Excel
    conversion, database archive download, extraction, trimming of the
    extracted cazy_data.txt, and clean-up. A short progress message is
    printed after each stage.
    """
    # Set up directory structure
    parse.dir_setup()
    print('dir_setup() done')

    # Download all CBM pages
    parse.wget_CAZy()
    print('wget_cazy() done')

    # Convert/combine html tables into an excel file
    parse.html_to_excel()
    print('html_to_excel() done')

    # Download cazy_data.zip
    db_zip = trim.wget_database()
    print('wget_database() done')

    # Extract cazy_data.zip
    func.unzip(zip_file=db_zip, output_dir=downloads_dir)
    print('unzip() done')

    # Search for cazy_data.txt in the downloads folder; only the first
    # regular file that matches is processed.
    for file in downloads_dir.glob('**/cazy_data.txt'):
        if file.is_file():
            # Read the database file and extract the lines relevant to CBMs
            trim.db_trim(file)
            print('db_trim() done')
            break
    else:
        # for/else: reached only when the loop never hit `break`, i.e. no
        # cazy_data.txt was found, so do not claim that db_trim() ran.
        print('cazy_data.txt not found in downloads; db_trim() skipped')

    # Delete any .tmp files created by python3-wget
    func.clean_up(cwd)
    print('clean_up() done\n\nCAZy Database Parser complete!')
2843

2944
if __name__ == '__main__':
3045
main()

utils/cazy_parse.py

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,29 @@
1-
import os
1+
import pathlib
22
import wget
33
import pandas as pd
44
import utils.cazy_functions as func
55

66
# Working directory
7-
home_dir = os.getcwd()
8-
downloads_dir = f'{home_dir}/downloads'
9-
results_dir = f'{home_dir}/results'
10-
pages_dir = f'{downloads_dir}/cbm_pages'
7+
cwd = pathlib.Path.cwd()
8+
downloads_dir = cwd.joinpath('downloads')
9+
results_dir = cwd.joinpath('results')
10+
pages_dir = downloads_dir.joinpath('cbm_pages')
1111

1212
# Make working directories if they don't exist
1313
def dir_setup():
    """Pass each working directory to func.dir_exists, parents first.

    The order matters only in that pages_dir is located inside
    downloads_dir, so downloads_dir is handled before it.
    """
    for directory in (downloads_dir, results_dir, pages_dir):
        func.dir_exists(directory)
18-
1917

2018
# Download all cbm pages listed on CAZy database
2119
def wget_CAZy():
2220
# Download the main CAZy CBM page
23-
if not os.path.isfile(f'{downloads_dir}/Carbohydrate-Binding-Modules.html'):
24-
wget.download(url='http://www.cazy.org/Carbohydrate-Binding-Modules.html', out=downloads_dir)
21+
cbm_home_page = downloads_dir.joinpath('Carbohydrate-Binding-Modules.html')
22+
if not cbm_home_page.is_file():
23+
wget.download(url='http://www.cazy.org/Carbohydrate-Binding-Modules.html', out=str(downloads_dir))
2524

2625
# Read html tables into a list of pandas DataFrames
27-
html_table = pd.read_html(f'{downloads_dir}/Carbohydrate-Binding-Modules.html')
26+
html_table = pd.read_html(cbm_home_page)
2827

2928
# Convert DataFrame into list of rows
3029
rows = html_table[0].values.tolist()
@@ -39,43 +38,49 @@ def wget_CAZy():
3938

4039
# Download all CBM pages that are not already downloaded
4140
for i in int_list:
42-
if not os.path.isfile(f'{pages_dir}/CBM{i}.html'):
43-
wget.download(url=f'http://www.cazy.org/CBM{i}.html', out=pages_dir)
41+
## Use if you only want to download missing pages
42+
# if not pages_dir.joinpath(f'CBM{i}.html').is_file():
43+
# wget.download(url=f'http://www.cazy.org/CBM{i}.html', out=str(pages_dir))
44+
45+
## Use if re-downloading/updating already downloaded files (recommended)
46+
wget.download(url=f'http://www.cazy.org/CBM{i}.html', out=str(pages_dir))
4447

4548
# Convert HTML tables inside CAZy pages to an excel spreadsheet
4649
def html_to_excel():
4750
substrate_dict = {}
48-
# fold_dict = {}
51+
4952
# Create list of all html files in current directory
50-
html_files = [file for file in os.listdir(pages_dir) if file.endswith('.html')]
53+
html_files = [file for file in pathlib.Path(pages_dir).glob('*.html') if file.is_file()]
5154
html_files.sort()
5255

5356
# Set up pages excel file writing
54-
pages_writer = pd.ExcelWriter(f'{results_dir}/CAZy Pages.xlsx', engine='xlsxwriter')
57+
pages_writer = pd.ExcelWriter(results_dir.joinpath('CAZy Pages.xlsx'), engine='xlsxwriter')
5558

5659
# Convert html tables to sheets within an excel file
5760
for file in html_files:
5861
# Read table(s) from html file into a list of DataFrames
59-
html_table = pd.read_html(f'{pages_dir}/{file}')
62+
html_table = pd.read_html(file)
6063

6164
# Extract DataFrame from list
6265
html_df = html_table[0]
6366

6467
# Write DataFrame to sheet in output excel file
65-
html_df.to_excel(pages_writer, sheet_name=file.strip('.html'), index=False, header=False)
68+
# file_name_from_path = str(file).split('/')[-1]
69+
file_name_from_path = str(file).rsplit('/', maxsplit=1)[-1]
70+
html_df.to_excel(pages_writer, sheet_name=file_name_from_path.strip('.html'), index=False, header=False)
6671

6772
# Add Activity row to the substrate dictionary
6873
labelled_html_df = html_df.set_index(0)
6974

7075
# Use Note row if Activities row is empty
7176
if isinstance(labelled_html_df[1].loc['Activities in Family'], float) is True:
72-
substrate_dict[file.strip('.html')] = labelled_html_df[1].loc['Note']
77+
substrate_dict[file_name_from_path.strip('.html')] = labelled_html_df[1].loc['Note']
7378
else:
74-
substrate_dict[file.strip('.html')] = labelled_html_df[1].loc['Activities in Family']
79+
substrate_dict[file_name_from_path.strip('.html')] = labelled_html_df[1].loc['Activities in Family']
7580
pages_writer.close()
7681

7782
# Set up cbm table excel file writing
78-
cbm_table_writer = pd.ExcelWriter(f'{results_dir}/CAZy CBM Table.xlsx', engine='xlsxwriter')
83+
cbm_table_writer = pd.ExcelWriter(results_dir.joinpath('CAZy CBM Table.xlsx'), engine='xlsxwriter')
7984

8085
# Convert CBM dictionary to a DataFrame
8186
cbm_df = pd.DataFrame.from_dict(substrate_dict, orient='index')

utils/database_trim.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,27 @@
1-
import os
2-
from pathlib import Path
1+
import pathlib
2+
from collections import defaultdict
33
import pandas as pd
44
import wget
5-
from collections import defaultdict
65
import utils.cazy_functions as func
76

8-
# Current working directory
9-
home_dir = os.getcwd()
10-
downloads_dir = f'{home_dir}/downloads'
11-
results_dir = f'{home_dir}/results'
7+
# Working directory
8+
cwd = pathlib.Path.cwd()
9+
downloads_dir = cwd.joinpath('downloads')
10+
results_dir = cwd.joinpath('results')
11+
pages_dir = downloads_dir.joinpath('cbm_pages')
1212

1313
# Download the cazy_data.zip archive to the downloads directory
1414
def wget_database():
    """Fetch cazy_data.zip into the downloads directory unless already there.

    Returns:
        The path downloads_dir / 'cazy_data.zip' (a pathlib path object),
        whether or not a download was needed.
    """
    # Make sure the downloads directory is in place before writing into it
    func.dir_exists(downloads_dir)

    database_zip = downloads_dir.joinpath('cazy_data.zip')

    # Only download when no copy of the archive is present yet
    if not database_zip.is_file():
        # The destination is handed over as a plain string, matching the
        # other wget.download calls in this project.
        wget.download(url='http://www.cazy.org/IMG/cazy_data/cazy_data.zip', out=str(downloads_dir))

    # Return path to cazy_data.zip
    return database_zip
2525

2626
# Read the database file and extract the lines relevant to CBMs
2727
def db_trim(database_file:str):
@@ -32,7 +32,7 @@ def db_trim(database_file:str):
3232
'GenBank Accession Number': []
3333
}
3434
family_list = []
35-
35+
3636
with open(database_file, 'r', encoding='utf-8') as input_file:
3737
# Write each line containing 'CBM' in the left column to the trimmed database
3838
for line in input_file:
@@ -49,15 +49,15 @@ def db_trim(database_file:str):
4949

5050
## SHEET 1 ##
5151
# Write out trimmed database
52-
database_writer = pd.ExcelWriter(f'{results_dir}/cazy_data_cbm_only.xlsx', engine='xlsxwriter')
52+
database_writer = pd.ExcelWriter(results_dir.joinpath('cazy_data_cbm_only.xlsx'), engine='xlsxwriter')
5353
cbm_df = pd.DataFrame.from_dict(cbm_database)
5454
cbm_df.to_excel(database_writer, sheet_name='CAZy Database CBMs', index=False, header=True)
5555

5656
## SHEET 2 ##
5757
# Count the occurrences of each CBM family in the trimmed database
5858
family_counter = defaultdict(int)
59-
for f in cbm_database['Family']:
60-
family_counter[f] += 1
59+
for family in cbm_database['Family']:
60+
family_counter[family] += 1
6161
family_counter = dict(family_counter)
6262

6363
# Add any missing families to the counter dictionary
@@ -74,8 +74,8 @@ def db_trim(database_file:str):
7474
## SHEET 3 ##
7575
# Count the occurrences of each Domain in the trimmed database
7676
domain_counter = defaultdict(int)
77-
for d in cbm_database['Domain']:
78-
domain_counter[d] += 1
77+
for domain in cbm_database['Domain']:
78+
domain_counter[domain] += 1
7979
domain_counter = dict(domain_counter)
8080

8181
domain_counter = {domain: count for domain, count in sorted(domain_counter.items(), key=lambda item: item[1], reverse=True)}

0 commit comments

Comments
 (0)