Replace os and glob path handling with pathlib

crfield18 · crfield18 · commit f04bf95c8bc3 · 2023-03-16T14:13:30.000Z
diff --git a/main.py b/main.py
@@ -1,30 +1,45 @@
-import glob, os
+import pathlib
 import utils.cazy_functions as func
 import utils.cazy_parse as parse
 import utils.database_trim as trim
 
 # Working Directories
-home_dir = os.getcwd()
-downloads_dir = f'{home_dir}/downloads'
-results_dir = f'{home_dir}/results'
+cwd = pathlib.Path.cwd()
+downloads_dir = cwd.joinpath('downloads')
+results_dir = cwd.joinpath('results')
 
 def main():
     # Set up directory structure
     parse.dir_setup()
+    print('dir_setup() done')
+
     # Download all CBM pages
     parse.wget_CAZy()
+    print('wget_cazy() done')
+
     # Convert/combine html tables into an excel file
     parse.html_to_excel()
+    print('html_to_excel() done')
+
     # Download cazy_data.zip
     db_zip = trim.wget_database()
+    print('wget_database() done')
+
     # Extract cazy_data.zip
     func.unzip(zip_file=db_zip, output_dir=downloads_dir)
+    print('unzip() done')
+
     # Search for cazy_data.txt in the downloads folder
-    for file in glob.glob(f'{home_dir}/downloads/**/cazy_data.txt', recursive=True):
-        if os.path.isfile(file):
-    # Read the database file and extract the lines relevant to CBMs
+    for file in downloads_dir.glob('**/cazy_data.txt'):
+        if pathlib.Path.is_file(file):
+            # Read the database file and extract the lines relevant to CBMs
             trim.db_trim(file)
-    print('Done.')
+            break
+    print('db_trim() done')
+
+    # Delete any .tmp files created by python3-wget
+    func.clean_up(cwd)
+    print('clean_up() done\n\nCAZy Database Parser complete!')
 
 if __name__ == '__main__':
     main()
diff --git a/utils/cazy_parse.py b/utils/cazy_parse.py
@@ -1,30 +1,29 @@
-import os
+import pathlib
 import wget
 import pandas as pd
 import utils.cazy_functions as func
 
 # Working directory
-home_dir = os.getcwd()
-downloads_dir = f'{home_dir}/downloads'
-results_dir = f'{home_dir}/results'
-pages_dir = f'{downloads_dir}/cbm_pages'
+cwd = pathlib.Path.cwd()
+downloads_dir = cwd.joinpath('downloads')
+results_dir = cwd.joinpath('results')
+pages_dir = downloads_dir.joinpath('cbm_pages')
 
 # Make working directories if they don't exist
 def dir_setup():
     func.dir_exists(downloads_dir)
     func.dir_exists(results_dir)
     func.dir_exists(pages_dir)
-    return None
-
 
 # Download all cbm pages listed on CAZy database
 def wget_CAZy():
     # Download the main CAZy CBM page
-    if not os.path.isfile(f'{downloads_dir}/Carbohydrate-Binding-Modules.html'):
-        wget.download(url='http://www.cazy.org/Carbohydrate-Binding-Modules.html', out=downloads_dir)
+    cbm_home_page = downloads_dir.joinpath('Carbohydrate-Binding-Modules.html')
+    if not cbm_home_page.is_file():
+        wget.download(url='http://www.cazy.org/Carbohydrate-Binding-Modules.html', out=str(downloads_dir))
 
     # Read html tables into a list of pandas DataFrames
-    html_table = pd.read_html(f'{downloads_dir}/Carbohydrate-Binding-Modules.html')
+    html_table = pd.read_html(cbm_home_page)
 
     # Convert DataFrame into list of rows
     rows = html_table[0].values.tolist()
@@ -39,43 +38,49 @@ def wget_CAZy():
 
     # Download all CBM pages that are not already downloaded
     for i in int_list:
-        if not os.path.isfile(f'{pages_dir}/CBM{i}.html'):
-            wget.download(url=f'http://www.cazy.org/CBM{i}.html', out=pages_dir)
+    ## Use if you only want to download missing pages
+    #     if not pages_dir.joinpath(f'CBM{i}.html').is_file():
+    #         wget.download(url=f'http://www.cazy.org/CBM{i}.html', out=str(pages_dir))
+
+    ## Use to re-download every page, including pages that were already downloaded (recommended)
+        wget.download(url=f'http://www.cazy.org/CBM{i}.html', out=str(pages_dir))
 
 # Convert HTML tables inside CAZy pages to an excel spreadsheet
 def html_to_excel():
     substrate_dict = {}
-    # fold_dict = {}
+
     # Create list of all html files in current directory
-    html_files = [file for file in os.listdir(pages_dir) if file.endswith('.html')]
+    html_files = [file for file in pathlib.Path(pages_dir).glob('*.html') if file.is_file()]
     html_files.sort()
 
     # Set up pages excel file writing
-    pages_writer = pd.ExcelWriter(f'{results_dir}/CAZy Pages.xlsx', engine='xlsxwriter')
+    pages_writer = pd.ExcelWriter(results_dir.joinpath('CAZy Pages.xlsx'), engine='xlsxwriter')
 
     # Convert html tables to sheets within an excel file
     for file in html_files:
         # Read table(s) from html file into a list of DataFrames
-        html_table = pd.read_html(f'{pages_dir}/{file}')
+        html_table = pd.read_html(file)
 
         # Extract DataFrame from list
         html_df = html_table[0]
 
         # Write DataFrame to sheet in output excel file
-        html_df.to_excel(pages_writer, sheet_name=file.strip('.html'), index=False, header=False)
+        # file_name_from_path = str(file).split('/')[-1]
+        file_name_from_path = str(file).rsplit('/', maxsplit=1)[-1]
+        html_df.to_excel(pages_writer, sheet_name=file_name_from_path.strip('.html'), index=False, header=False)
 
         # Add Activity row to each directory
         labelled_html_df = html_df.set_index(0)
 
         # Use Note row if Activities row is empty
         if isinstance(labelled_html_df[1].loc['Activities in Family'], float) is True:
-            substrate_dict[file.strip('.html')] = labelled_html_df[1].loc['Note']
+            substrate_dict[file_name_from_path.strip('.html')] = labelled_html_df[1].loc['Note']
         else:
-            substrate_dict[file.strip('.html')] = labelled_html_df[1].loc['Activities in Family']
+            substrate_dict[file_name_from_path.strip('.html')] = labelled_html_df[1].loc['Activities in Family']
     pages_writer.close()
 
     # Set up cbm table excel file writing
-    cbm_table_writer = pd.ExcelWriter(f'{results_dir}/CAZy CBM Table.xlsx', engine='xlsxwriter')
+    cbm_table_writer = pd.ExcelWriter(results_dir.joinpath('CAZy CBM Table.xlsx'), engine='xlsxwriter')
 
     # Convert CBM dictionary to a DataFrame
     cbm_df = pd.DataFrame.from_dict(substrate_dict, orient='index')
diff --git a/utils/database_trim.py b/utils/database_trim.py
@@ -1,27 +1,27 @@
-import os
-from pathlib import Path
+import pathlib
+from collections import defaultdict
 import pandas as pd
 import wget
-from collections import defaultdict
 import utils.cazy_functions as func
 
-# Current working directory
-home_dir = os.getcwd()
-downloads_dir = f'{home_dir}/downloads'
-results_dir = f'{home_dir}/results'
+# Working directory
+cwd = pathlib.Path.cwd()
+downloads_dir = cwd.joinpath('downloads')
+results_dir = cwd.joinpath('results')
+pages_dir = downloads_dir.joinpath('cbm_pages')
 
 # Download the cazy_data file to the CAZy_database directory
 def wget_database():
     # Make CAZy_database directory if it does not exist
     func.dir_exists(downloads_dir)
     # Download cazy_data.zip to CAZy_database if it does not exist
-    database_zip = Path(f'{downloads_dir}/cazy_data.zip')
+    database_zip = downloads_dir.joinpath('cazy_data.zip')
     if database_zip.is_file():
         pass
     else:
-        wget.download(url='http://www.cazy.org/IMG/cazy_data/cazy_data.zip', out=downloads_dir)
+        wget.download(url='http://www.cazy.org/IMG/cazy_data/cazy_data.zip', out=str(downloads_dir))
     # Return path to cazy_data.zip
-    return f'{downloads_dir}/cazy_data.zip'
+    return database_zip
 
 # Read the database file and extract the lines relevant to CBMs
 def db_trim(database_file:str):
@@ -32,7 +32,7 @@ def db_trim(database_file:str):
         'GenBank Accession Number': []
     }
     family_list = []
-    
+
     with open(database_file, 'r', encoding='utf-8') as input_file:
         # Write each line containing 'CBM' in the left column to the trimmed database
         for line in input_file:
@@ -49,15 +49,15 @@ def db_trim(database_file:str):
 
     ## SHEET 1 ##
     # Write out trimmed database
-    database_writer = pd.ExcelWriter(f'{results_dir}/cazy_data_cbm_only.xlsx', engine='xlsxwriter')
+    database_writer = pd.ExcelWriter(results_dir.joinpath('cazy_data_cbm_only.xlsx'), engine='xlsxwriter')
     cbm_df = pd.DataFrame.from_dict(cbm_database)
     cbm_df.to_excel(database_writer, sheet_name='CAZy Database CBMs', index=False, header=True)
 
     ## SHEET 2 ##
     # Count the occurrences of each CBM family in the trimmed
     family_counter = defaultdict(int)
-    for f in cbm_database['Family']:
-        family_counter[f] += 1
+    for family in cbm_database['Family']:
+        family_counter[family] += 1
     family_counter = dict(family_counter)
 
     # Add any missing families to the counter dictionary
@@ -74,8 +74,8 @@ def db_trim(database_file:str):
     ## SHEET 3 ##
     # Count the occurrences of each Domain in the trimmed database
     domain_counter = defaultdict(int)
-    for d in cbm_database['Domain']:
-        domain_counter[d] += 1
+    for domain in cbm_database['Domain']:
+        domain_counter[domain] += 1
     domain_counter = dict(domain_counter)
 
     domain_counter = {domain: count for domain, count in sorted(domain_counter.items(), key=lambda item: item[1], reverse=True)}