Fix random selection of taxonomy entries (sample without replacement)

JFsanchezherrero · Jun 12, 2021 · 08a59e2
1 parent 4245ca0
commit 08a59e2
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 45 deletions.
diff --git a/code/BacDup/modules/input_parser.py b/code/BacDup/modules/input_parser.py
@@ -16,12 +16,16 @@
 import argparse
 import time
 from Bio import SeqIO
-import HCGB
-from HCGB.functions.aesthetics_functions import debug_message
-import HCGB.functions.time_functions as time_functions
 from termcolor import colored
 import pandas as pd
 
+## import HCGB
+from HCGB.functions.aesthetics_functions import debug_message
+import HCGB.functions.time_functions as HCGB_time
+import HCGB.functions.files_functions as HCGB_files
+import HCGB.functions.aesthetics_functions as HCGB_aes
+import HCGB.functions.main_functions as HCGB_main
+
 ## my modules
 import BacDup
 import BacDup.scripts.gbf_parser as gbf_parser
@@ -47,9 +51,9 @@ def run_input(arg_dict):
  exit()
 
  BacDup_functions.pipeline_header('BacDup')
- HCGB.functions.aesthetics_functions.boxymcboxface("Preparing input files")
+ HCGB_aes.boxymcboxface("Preparing input files")
  print ("--------- Starting Process ---------")
- time_functions.print_time()
+ HCGB_time.print_time()
 
  ## init time
  start_time_total = time.time()
@@ -60,7 +64,7 @@ def run_input(arg_dict):
 
  ## output folder 
  print ("\n+ Create output folder(s):")
- HCGB.functions.files_functions.create_folder(outdir)
+ HCGB_files.create_folder(outdir)
 
  ## set defaults
  if not (arg_dict.assembly_level):
@@ -76,7 +80,7 @@ def run_input(arg_dict):
  else:
  arg_dict.project = True
  print ("+ Generate a directory containing information within the project folder provided")
- final_dir = HCGB.functions.files_functions.create_subfolder("info", outdir)
+ final_dir = HCGB_files.create_subfolder("info", outdir)
 
  ## debug messages
  if (arg_dict.debug):
@@ -90,7 +94,7 @@ def run_input(arg_dict):
 
  ## get files
  print ()
- HCGB.functions.aesthetics_functions.print_sepLine("-",50, False)
+ HCGB_aes.print_sepLine("-",50, False)
  print ('+ Getting input information provided... ')
  print ('+ Several options available:')
  print ('\t* Single/Multiple Annotation file:')
@@ -104,7 +108,7 @@ def run_input(arg_dict):
  time.sleep(1)
 
  ## time stamp
- start_time_partial = time_functions.timestamp(start_time_total)
+ start_time_partial = HCGB_time.timestamp(start_time_total)
 
  #################################################
  ## Parse and obtain the type of input information provided
@@ -116,24 +120,24 @@ def run_input(arg_dict):
  ## 'plasmids_number','plasmids_ID'))
 
  ## time stamp
- start_time_partial = time_functions.timestamp(start_time_partial)
+ start_time_partial = HCGB_time.timestamp(start_time_partial)
 
  ## parse information accordingly
  parse_information(arg_dict, df_accID, outdir)
 
  ### report generation
- HCGB.functions.aesthetics_functions.boxymcboxface("Summarizing input files")
- outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
+ HCGB_aes.boxymcboxface("Summarizing input files")
+ outdir_report = HCGB_files.create_subfolder("report", outdir)
 
- input_report = HCGB.functions.files_functions.create_subfolder("input", outdir_report)
+ input_report = HCGB_files.create_subfolder("input", outdir_report)
 
  ## add df_accID.loc[sample,] information as csv into input folder
  df_accID.to_csv(os.path.join(input_report, 'info.csv'), index=True, header=True)
 
  ## maybe add a summary of the files?
 
  print ("\n*************** Finish *******************")
- start_time_partial = time_functions.timestamp(start_time_total)
+ start_time_partial = HCGB_time.timestamp(start_time_total)
 
  print ("+ Exiting Input module.")
  return()
@@ -142,8 +146,8 @@ def run_input(arg_dict):
 def parse_information(arg_dict, df_accID, outdir):
 
  ### Parse df_accID
- dict_input_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project, df_accID, "input", arg_dict.debug)
- dict_parse_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project, df_accID, "parse", arg_dict.debug)
+ dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "input", arg_dict.debug)
+ dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "parse", arg_dict.debug)
 
  ## debug messages
  if (arg_dict.debug):
@@ -170,34 +174,34 @@ def parse_information(arg_dict, df_accID, outdir):
  print()
  print ("\t+ Parsing sample: " + sample)
 
- if (not HCGB.functions.files_functions.is_non_zero_file(parse_timestamp) and not HCGB.functions.files_functions.is_non_zero_file(input_timestamp)):
+ if (not HCGB_files.is_non_zero_file(parse_timestamp) and not HCGB_files.is_non_zero_file(input_timestamp)):
 
  ## TODO: Set threads to use in parallel
  process_OK = parse_annot_file(sample, folder_input, df_accID.loc[sample, 'annot_file'], dict_parse_folders[sample], arg_dict.debug, df_accID.loc[sample, 'genome'])
 
  if (process_OK):
 
  ## link or copy annotation file into folder_input
- HCGB.functions.files_functions.get_symbolic_link_file(df_accID.loc[sample, 'annot_file'], folder_input)
+ HCGB_files.get_symbolic_link_file(df_accID.loc[sample, 'annot_file'], folder_input)
 
  ## add df_accID.loc[sample,] information as csv into input folder
  df_accID.loc[sample,].to_csv(os.path.join(folder_input, 'info.csv'), index=True, header=True)
 
  ## print time stamp
- time_functions.print_time_stamp(input_timestamp)
+ HCGB_time.print_time_stamp(input_timestamp)
 
  ## print time stamp
- time_functions.print_time_stamp(parse_timestamp)
+ HCGB_time.print_time_stamp(parse_timestamp)
  else:
  print(colored("\t+ Some error occurred for sample %s while parsing input options" %sample, 'red'))
 
  ## print time stamp
- time_functions.print_time_stamp(os.path.join(folder_input, '.fail'))
+ HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))
 
  ## print time stamp
- time_functions.print_time_stamp(os.path.join(dict_parse_folders[sample], '.fail'))
+ HCGB_time.print_time_stamp(os.path.join(dict_parse_folders[sample], '.fail'))
  else:
- read_time = time_functions.read_time_stamp(parse_timestamp)
+ read_time = HCGB_time.read_time_stamp(parse_timestamp)
  print (colored("\t+ Input parsing already available for sample %s [%s]" %(sample, read_time), 'green'))
  print()
 
@@ -250,7 +254,7 @@ def parse_annot_file(name, folder_out_input, annot_file, output_path, Debug, ref
 
  elif(format=='gff'):
  print (colored('\t* GFF format file:.......[OK]', 'green'))
- if (HCGB.functions.files_functions.is_non_zero_file(ref_file)):
+ if (HCGB_files.is_non_zero_file(ref_file)):
  return(gff_parser.gff_parser_caller(annot_file, ref_file, output_path, Debug))
  else:
  print(colored("ERROR: No genome reference file provided for this GFF annotation. Check input options provided.","red"))
@@ -293,7 +297,7 @@ def parse_options(arg_dict):
  BacDup_functions.file_readable_check(arg_dict.annot_file)
 
  print (colored('\t* Multiple annotation files provided .......[OK]', 'green'))
- dict_entries = HCGB.functions.main_functions.file2dictionary(arg_dict.annot_file, ',')
+ dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')
 
  ## debug messages
  if (arg_dict.debug):
@@ -361,7 +365,7 @@ def parse_options(arg_dict):
  BacDup_functions.file_readable_check(arg_dict.ref_file)
 
  if (arg_dict.batch):
- ref_entries = HCGB.functions.main_functions.file2dictionary(arg_dict.ref_file, ',')
+ ref_entries = HCGB_main.file2dictionary(arg_dict.ref_file, ',')
  genome = ref_entries[name]
  else:
  genome = arg_dict.ref_file
@@ -382,9 +386,9 @@ def parse_options(arg_dict):
  elif (arg_dict.GenBank_id):
  ## get database path
  if (arg_dict.db_folder):
- db_folder = HCGB.functions.files_functions.create_folder(os.path.abspath(arg_dict.db_folder))
+ db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
  else:
- db_folder = HCGB.functions.files_functions.create_subfolder("db", os.path.abspath(arg_dict.output_folder))
+ db_folder = HCGB_files.create_subfolder("db", os.path.abspath(arg_dict.output_folder))
 
  ## debug messages
  if (arg_dict.debug):
@@ -410,7 +414,7 @@ def parse_options(arg_dict):
  print()
 
  ## call IDs into a list and create tmp folder
- strains2get = HCGB.functions.main_functions.readList_fromFile(arg_dict.GenBank_id)
+ strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
  strains2get = list(filter(None, strains2get))
 
  ## debug messages
@@ -435,7 +439,7 @@ def parse_options(arg_dict):
  ## download
  print (colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
  print()
- HCGB.functions.aesthetics_functions.print_sepLine("+", 75, False)
+ HCGB_aes.print_sepLine("+", 75, False)
  df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(arg_dict.GenBank_id, db_folder, arg_dict.debug)
 
  ## --------------------------------------- ##
@@ -457,7 +461,7 @@ def parse_options(arg_dict):
  BacDup_functions.file_readable_check(arg_dict.tax_id)
 
  ## get IDs into a list
- taxIDs2get = HCGB.functions.main_functions.readList_fromFile(arg_dict.tax_id)
+ taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)
 
  else:
  print (colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
@@ -496,9 +500,9 @@ def parse_options(arg_dict):
  ## get database path
  #################
  if (arg_dict.db_folder):
- db_folder = HCGB.functions.files_functions.create_folder(os.path.abspath(arg_dict.db_folder))
+ db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
  else:
- db_folder = HCGB.functions.files_functions.create_subfolder("db", outdir)
+ db_folder = HCGB_files.create_subfolder("db", outdir)
 
  ## debug messages 
  if arg_dict.debug:
@@ -518,20 +522,25 @@ def parse_options(arg_dict):
 
  ## print list and dictionary of possible and selected taxIDs
  outdir = os.path.abspath(arg_dict.output_folder)
- final_dir = HCGB.functions.files_functions.create_subfolder("info", outdir)
- input_info_dir = HCGB.functions.files_functions.create_subfolder("input", outdir)
- HCGB.functions.main_functions.printList2file(os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
- HCGB.functions.main_functions.printList2file(os.path.join(input_info_dir, 'all_entries.txt'), allstrains_available)
+ info_dir = HCGB_files.create_subfolder("info", outdir)
+ input_info_dir = HCGB_files.create_subfolder("input", info_dir)
+ HCGB_main.printList2file(os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
+ HCGB_main.printList2file(os.path.join(input_info_dir, 'all_entries.txt'), allstrains_available)
 
  ## save into file
  file_info = os.path.join(input_info_dir, 'info.txt')
 
  ## stop here if dry_run
  if arg_dict.dry_run:
+ print()
+ HCGB_aes.print_sepLine("*", 75, False)
  print ("ATTENTION: Dry run mode selected. Stopping the process here.")
- print("All available entries listed and printed in file: "+ os.path.join(input_info_dir, 'all_entries.txt'))
- print("Subset of entries generated and printed in file: "+ os.path.join(input_info_dir, 'Downloaded.txt'))
- print ("\n\nIf random numbers selected, take into account re-running this process might produce different results.")
+ HCGB_aes.print_sepLine("*", 75, False)
+ print("+ All available entries listed and printed in file:\n\t"+ os.path.join(input_info_dir, 'all_entries.txt'))
+ print("+ Subset of entries generated and printed in file:\n\t"+ os.path.join(input_info_dir, 'Downloaded.txt'))
+ print ("\n\nIf random numbers selected, take into account re-running this process might produce different results.\n")
+ HCGB_aes.print_sepLine("*", 75, False)
+ print()
  exit()
 
  #################

diff --git a/code/BacDup/scripts/taxonomy_retrieval.py b/code/BacDup/scripts/taxonomy_retrieval.py
@@ -131,12 +131,18 @@ def get_GenBank_ids(data_folder, taxID_list, random_k, debug, assembly_level_giv
  ##
  if random_k<0:
  list_entries = list(dict_entries.keys())
- print ('All %s entries selected' %dict_entries_len)
-
+ print ('+ All %s entries selected' %dict_entries_len)
  else:
- print ("Selecting random entries retrieved:")
- list_entries = random.choices(list(dict_entries.keys()), k=random_k)
- print ('%s entries selected out of %s' %(str(random_k), str(dict_entries_len)))
+ if random_k > dict_entries_len:
+ print("+ Sample size desired larger than population.")
+ list_entries = list(dict_entries.keys())
+ print ('\tOnly %s entries selected out of %s specified' %(dict_entries_len, random_k))
+ else:
+ print ("+ Selecting random entries retrieved:")
+ list_entries = random.sample(set(list(dict_entries.keys())), k=random_k)
+ print ('\t%s entries selected out of %s' %(str(random_k), str(dict_entries_len)))
+
+ print() 
 
  ## debug messages
  if debug: