Skip to content

Commit

Permalink
update code
Browse files (browse the repository at this point in the history)
  • Loading branch information
JFsanchezherrero committed Jun 10, 2021
1 parent 6a0febb commit 3358175
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 26 deletions.
69 changes: 45 additions & 24 deletions code/BacDup/modules/search.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,7 @@
## Jose F. Sanchez & Alba Moya ##
## Copyright (C) 2020-2021 ##
##############################################################
from pandas._libs.missing import NA
'''
Created on 13 dic. 2020
@author: alba
Expand All @@ -22,7 +23,9 @@

import HCGB
from HCGB.functions.aesthetics_functions import debug_message
import HCGB.functions.time_functions as time_functions
import HCGB.functions.aesthetics_functions as HCGB_aes
import HCGB.functions.time_functions as HCGB_time
import HCGB.functions.main_functions as HCGB_main
from HCGB import sampleParser

import BacDup.modules.info as info
Expand Down Expand Up @@ -60,9 +63,9 @@ def run_search(arg_dict):

### Start the analysis
BacDup_functions.pipeline_header('BacDup')
HCGB.functions.aesthetics_functions.boxymcboxface("Search module")
HCGB_aes.boxymcboxface("Search module")
print ("--------- Starting Process ---------")
time_functions.print_time()
HCGB_time.print_time()

## init time
start_time_total = time.time()
Expand Down Expand Up @@ -90,7 +93,7 @@ def run_search(arg_dict):

## get files
print ()
HCGB.functions.aesthetics_functions.print_sepLine("-",50, False)
HCGB_aes.print_sepLine("-",50, False)
print ('+ Getting information provided... ')
print ('+ Several options available:')
print ('\t* BacDup project folder with initiated data')
Expand All @@ -111,11 +114,17 @@ def run_search(arg_dict):
pd_samples_retrieved = parse_search_options(arg_dict)

## time stamp
start_time_partial = time_functions.timestamp(start_time_total)
start_time_partial = HCGB_time.timestamp(start_time_total)

## for each sample
dict_search_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug)
dict_dup_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug)
dict_search_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project,
pd_samples_retrieved, "search", arg_dict.debug)

dict_dup_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project,
pd_samples_retrieved, "dups", arg_dict.debug)

dict_parse_folders = HCGB.functions.files_functions.outdir_project(outdir, arg_dict.project,
pd_samples_retrieved, "parse", arg_dict.debug)

## create results
data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table())
Expand All @@ -132,7 +141,10 @@ def run_search(arg_dict):
## get results
file_data = pd_samples_retrieved.loc[sample, 'file_data']
format = pd_samples_retrieved.loc[sample, 'format']
filtered_data = dup_searcher.filter_data(sample, file_data, format, arg_dict.pident, arg_dict.evalue, arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug)
filtered_data = dup_searcher.filter_data(sample, file_data, format,
arg_dict.pident, arg_dict.evalue,
arg_dict.percentage, arg_dict.bitscore,
folder, arg_dict.debug)

## timestamps
filter_timestamp = os.path.join(dict_dup_folders[sample], '.filter_success')
Expand All @@ -142,13 +154,14 @@ def run_search(arg_dict):
filtered_data.to_csv(sort_csv, header=True, index=False)

## print time stamp
time_functions.print_time_stamp(filter_timestamp)
HCGB_time.print_time_stamp(filter_timestamp)
else:
read_time = time_functions.read_time_stamp(filter_timestamp)
read_time = HCGB_time.read_time_stamp(filter_timestamp)
print (colored("\t+ Filter results already available for sample %s [%s]" %(sample, read_time), 'green'))

## get annotation
(dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(sample, filtered_data, annot_table_file, arg_dict.debug)
(dup_annot_df, data2add_entry) = dup_searcher.get_dupannot(sample, filtered_data,
annot_table_file, arg_dict.debug)

##
info_dup_file = os.path.join(dict_dup_folders[sample], 'info_dup.csv')
Expand All @@ -158,23 +171,31 @@ def run_search(arg_dict):
dup_annot_df.to_csv(dup_annot_file, header=True)

## print time stamp
time_functions.print_time_stamp(annot_timestamp)
HCGB_time.print_time_stamp(annot_timestamp)

else:
read_time = time_functions.read_time_stamp(annot_timestamp)
read_time = HCGB_time.read_time_stamp(annot_timestamp)
print (colored("\t+ Duplicate annotation already available for sample %s [%s]" %(sample, read_time), 'green'))

## add info for each
dup_annot_df = HCGB.functions.main_functions.get_data(dup_annot_file, ',', "index_col=0")
annot_table = HCGB.functions.main_functions.get_data(annot_table_file, ',', "index_col=0")
dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0")
annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0")
data2add_entry = dup_searcher.get_dup_stats(sample, dup_annot_df, annot_table, arg_dict.debug)


## add genome length data
data2add_entry['genome_len'] = ''
len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv')
if os.path.isfile(len_df_file):
len_data = HCGB_main.get_data(len_df_file, ',', "header=None")
data2add_entry['genome_len'] = len_data[1].sum()

## merge data
#data2add_entry = data2add_entry.reset_index()
data2add = data2add.append(data2add_entry, ignore_index=False)

### report generation
HCGB.functions.aesthetics_functions.boxymcboxface("Summarizing duplicated search")
HCGB_aes.boxymcboxface("Summarizing duplicated search")
outdir_report = HCGB.functions.files_functions.create_subfolder("report", outdir)
dups_report = HCGB.functions.files_functions.create_subfolder("dups", outdir_report)

Expand All @@ -184,7 +205,7 @@ def run_search(arg_dict):
## maybe add a summary of the files?

print ("\n*************** Finish *******************")
start_time_partial = time_functions.timestamp(start_time_total)
start_time_partial = HCGB_time.timestamp(start_time_total)

print ("+ Exiting search module.")
return()
Expand Down Expand Up @@ -227,13 +248,13 @@ def parse_search_options(arg_dict):
## debug messages
if (arg_dict.debug):
debug_message('pd_proteins:', 'yellow')
HCGB.functions.main_functions.print_all_pandaDF(pd_proteins)
HCGB_main.print_all_pandaDF(pd_proteins)

debug_message('pd_annot:', 'yellow')
HCGB.functions.main_functions.print_all_pandaDF(pd_annot)
HCGB_main.print_all_pandaDF(pd_annot)

debug_message('pd_samples_retrieved:', 'yellow')
HCGB.functions.main_functions.print_all_pandaDF(pd_samples_retrieved)
HCGB_main.print_all_pandaDF(pd_samples_retrieved)

## --------------------------------------- ##
## data on multiple sources
Expand Down Expand Up @@ -265,11 +286,11 @@ def parse_search_options(arg_dict):
BacDup_functions.file_readable_check(arg_dict.text_file)

print (colored('\t* Multiple BLAST results files provided .......[OK]', 'green'))
dict_entries = HCGB.functions.main_functions.file2dictionary(arg_dict.text_file, ',')
dict_entries = HCGB_main.file2dictionary(arg_dict.text_file, ',')

## check file is readable
BacDup_functions.file_readable_check(arg_dict.annot_table)
dict_entries_annot = HCGB.functions.main_functions.file2dictionary(arg_dict.annot_table, ',')
dict_entries_annot = HCGB_main.file2dictionary(arg_dict.annot_table, ',')

## Check dictionaries contain same information
if (dict_entries.keys() == dict_entries_annot.keys()):
Expand Down Expand Up @@ -349,12 +370,12 @@ def parse_search_options(arg_dict):

## check if ok
BacDup_functions.file_readable_check(arg_dict.fasta_prot)
dict_entries = HCGB.functions.main_functions.file2dictionary(arg_dict.fasta_prot, ',')
dict_entries = HCGB_main.file2dictionary(arg_dict.fasta_prot, ',')

## check file is readable
BacDup_functions.file_readable_check(arg_dict.annot_table)
print (colored('\t* Multiple annotation tables provided .......[OK]', 'green'))
dict_entries_annot = HCGB.functions.main_functions.file2dictionary(arg_dict.annot_table, ',')
dict_entries_annot = HCGB_main.file2dictionary(arg_dict.annot_table, ',')

## Check dictionaries contain right information
if (dict_entries.keys() == dict_entries_annot.keys()):
Expand Down
2 changes: 1 addition & 1 deletion code/BacDup/scripts/dup_searcher.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -130,7 +130,7 @@ def get_dup_stats(sample, dup_annot_df, annot_table, debug):

## convert to strings to save
list_entries = [str(x) for x in list_entries]
mean = "{0:.3g} ".format(d[2])
mean = "{0:.3g}".format(d[2])
counts_dups = len(df_dup[1]["dup_id"])
n_tot = d[0]
n_min = d[1][0]
Expand Down
2 changes: 1 addition & 1 deletion code/BacDup/scripts/functions.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,7 +9,7 @@

################################################################################
def columns_dup_table():
columns = ["total_prots",
columns = ["total_prots", "genome_len",
"n_groups_all", "n_dups_all",
"n_groups_pseudoFree", "n_dups_pseudoFree",
"n_groups_pseudoMobFree", "n_dups_pseudoMobFree",
Expand Down

0 comments on commit 3358175

Please sign in to comment.