1
- import os
1
+ import pathlib
2
2
import wget
3
3
import pandas as pd
4
4
import utils .cazy_functions as func
5
5
6
6
# Working directories, all anchored at the directory the script is run from.
# pathlib.Path objects are used throughout; callers that need a plain string
# (e.g. wget's `out=` argument) convert with str().
cwd = pathlib.Path.cwd()
downloads_dir = cwd / 'downloads'         # HTML fetched from the CAZy site
results_dir = cwd / 'results'             # generated Excel workbooks
pages_dir = downloads_dir / 'cbm_pages'   # one downloaded page per CBM family
11
11
12
12
# Make working directories if they don't exist
13
13
def dir_setup():
    """Run ``func.dir_exists`` on each of the script's working directories.

    The directories are visited in the order downloads, results, pages, so
    the parent ``downloads_dir`` is handled before its child ``pages_dir``.
    NOTE(review): ``func.dir_exists`` presumably creates a directory when it
    is missing -- confirm in ``utils.cazy_functions``.
    """
    for working_dir in (downloads_dir, results_dir, pages_dir):
        func.dir_exists(working_dir)
19
17
20
18
# Download all cbm pages listed on CAZy database
21
19
def wget_CAZy ():
22
20
# Download the main CAZy CBM page
23
- if not os .path .isfile (f'{ downloads_dir } /Carbohydrate-Binding-Modules.html' ):
24
- wget .download (url = 'http://www.cazy.org/Carbohydrate-Binding-Modules.html' , out = downloads_dir )
21
+ cbm_home_page = downloads_dir .joinpath ('Carbohydrate-Binding-Modules.html' )
22
+ if not cbm_home_page .is_file ():
23
+ wget .download (url = 'http://www.cazy.org/Carbohydrate-Binding-Modules.html' , out = str (downloads_dir ))
25
24
26
25
# Read html tables into a list of pandas DataFrames
27
- html_table = pd .read_html (f' { downloads_dir } /Carbohydrate-Binding-Modules.html' )
26
+ html_table = pd .read_html (cbm_home_page )
28
27
29
28
# Convert DataFrame into list of rows
30
29
rows = html_table [0 ].values .tolist ()
@@ -39,43 +38,49 @@ def wget_CAZy():
39
38
40
39
# Download all CBM pages that are not already downloaded
41
40
for i in int_list :
42
- if not os .path .isfile (f'{ pages_dir } /CBM{ i } .html' ):
43
- wget .download (url = f'http://www.cazy.org/CBM{ i } .html' , out = pages_dir )
41
+ ## Use if you only want to download missing pages
42
+ # if not pages_dir.joinpath(f'CBM{i}.html').is_file():
43
+ # wget.download(url=f'http://www.cazy.org/CBM{i}.html', out=str(pages_dir))
44
+
45
+ ## Use if re-downloading/updating already downloaded files (recommended)
46
+ wget .download (url = f'http://www.cazy.org/CBM{ i } .html' , out = str (pages_dir ))
44
47
45
48
# Convert HTML tables inside CAZy pages to an excel spreadsheet
46
49
def html_to_excel ():
47
50
substrate_dict = {}
48
- # fold_dict = {}
51
+
49
52
# Create list of all html files in the CBM pages directory
50
- html_files = [file for file in os . listdir (pages_dir ) if file . endswith ( ' .html' )]
53
+ html_files = [file for file in pathlib . Path (pages_dir ). glob ( '* .html') if file . is_file ( )]
51
54
html_files .sort ()
52
55
53
56
# Set up pages excel file writing
54
- pages_writer = pd .ExcelWriter (f' { results_dir } / CAZy Pages.xlsx' , engine = 'xlsxwriter' )
57
+ pages_writer = pd .ExcelWriter (results_dir . joinpath ( ' CAZy Pages.xlsx') , engine = 'xlsxwriter' )
55
58
56
59
# Convert html tables to sheets within an excel file
57
60
for file in html_files :
58
61
# Read table(s) from html file into a list of DataFrames
59
- html_table = pd .read_html (f' { pages_dir } / { file } ' )
62
+ html_table = pd .read_html (file )
60
63
61
64
# Extract DataFrame from list
62
65
html_df = html_table [0 ]
63
66
64
67
# Write DataFrame to sheet in output excel file
65
- html_df .to_excel (pages_writer , sheet_name = file .strip ('.html' ), index = False , header = False )
68
+ # file_name_from_path = str(file).split('/')[-1]
69
+ file_name_from_path = str (file ).rsplit ('/' , maxsplit = 1 )[- 1 ]
70
+ html_df .to_excel (pages_writer , sheet_name = file_name_from_path .strip ('.html' ), index = False , header = False )
66
71
67
72
# Add the Activity row of each page to the substrate dictionary
68
73
labelled_html_df = html_df .set_index (0 )
69
74
70
75
# Use Note row if Activities row is empty
71
76
if isinstance (labelled_html_df [1 ].loc ['Activities in Family' ], float ) is True :
72
- substrate_dict [file .strip ('.html' )] = labelled_html_df [1 ].loc ['Note' ]
77
+ substrate_dict [file_name_from_path .strip ('.html' )] = labelled_html_df [1 ].loc ['Note' ]
73
78
else :
74
- substrate_dict [file .strip ('.html' )] = labelled_html_df [1 ].loc ['Activities in Family' ]
79
+ substrate_dict [file_name_from_path .strip ('.html' )] = labelled_html_df [1 ].loc ['Activities in Family' ]
75
80
pages_writer .close ()
76
81
77
82
# Set up cbm table excel file writing
78
- cbm_table_writer = pd .ExcelWriter (f' { results_dir } / CAZy CBM Table.xlsx' , engine = 'xlsxwriter' )
83
+ cbm_table_writer = pd .ExcelWriter (results_dir . joinpath ( ' CAZy CBM Table.xlsx') , engine = 'xlsxwriter' )
79
84
80
85
# Convert CBM dictionary to a DataFrame
81
86
cbm_df = pd .DataFrame .from_dict (substrate_dict , orient = 'index' )
0 commit comments