Merge pull request #421 from clemente-lab/IssueFix-consistent-validation
Issue fix consistent validation
adamcantor22 authored Jul 22, 2022
2 parents 02b1a87 + dff9c71 commit c2796ee
Showing 9 changed files with 249 additions and 228 deletions.
9 changes: 6 additions & 3 deletions mmeds/summary.py
@@ -130,8 +130,10 @@ def summarize_qiime2(path, files, config, study_name, testing=False):
     summary_files = defaultdict(list)
 
     # Get Table Stats
-    cmd = f"qiime tools export --input-path {str(files['stats_table'])} --output-path {str(path / 'temp')}"
-    run(cmd, env=new_env, check=True, shell=True)
+    for key in files:
+        if 'stats_table' in key:
+            cmd = f"qiime tools export --input-path {str(files[key])} --output-path {str(path / 'temp')}"
+            run(cmd, env=new_env, check=True, shell=True)
     table_stat_files = (path / 'temp').glob("stats.tsv")
     for table_file in table_stat_files:
         copy(table_file, files['summary'])
@@ -542,7 +544,8 @@ def write_notebook(self, nn, testing=False):
         """
         try:
             jupyter_env = setup_environment('jupyter')
-            latex_env = setup_environment('latex')
+            if testing:
+                latex_env = setup_environment('latex')
 
             nbf.write(nn, str(self.path / '{}.ipynb'.format(self.name)))
 
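The summarize_qiime2 change above exports every artifact whose key contains 'stats_table', so studies with several sequencing runs (e.g. stats_table_0, stats_table_1) each contribute a stats.tsv to the summary. A minimal standalone sketch of that export loop, assuming a files mapping of pathlib.Path values and a QIIME 2 install on PATH; the helper name is illustrative, not from the repo:

    import subprocess
    from pathlib import Path

    def export_stats_tables(files: dict, out_dir: Path, env: dict = None) -> None:
        """Export every QIIME 2 artifact whose key contains 'stats_table' into out_dir."""
        for key, artifact in files.items():
            if 'stats_table' in key:
                cmd = f"qiime tools export --input-path {artifact} --output-path {out_dir}"
                subprocess.run(cmd, env=env, check=True, shell=True)

    # e.g. export_stats_tables({'stats_table_0': Path('dada2_stats_0.qza')}, Path('temp'))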
9 changes: 5 additions & 4 deletions mmeds/tests/server/test_server.py
@@ -152,7 +152,8 @@ def test_ca_animal_upload(self):
     def test_cb_upload(self):
         Logger.info('cb upload')
         self.login()
-        self.upload_sequencing_run()
+        self.upload_sequencing_run('TEST_RUN')
+        self.upload_sequencing_run('TEST_RUN_ALT')
         self.upload_metadata_fail()
         # TODO: Repair this test. This function is working in production and when testing locally,
         # but github actions gets an 'InvalidSQL' error here
@@ -673,8 +674,8 @@ def upload_metadata(self):
         self.assertBody(page)
         Logger.debug('Checked a metadata file with no problems')
 
-    def upload_sequencing_run(self):
-        self.getPage('/upload/upload_sequencing_run?barcodes_type=single&run_name=TEST_RUN', self.cookies)
+    def upload_sequencing_run(self, run_name):
+        self.getPage(f'/upload/upload_sequencing_run?barcodes_type=single&run_name={run_name}', self.cookies)
         self.assertStatus('200 OK')
         headers, body = self.upload_files(['for_reads', 'rev_reads', 'barcodes', 'reads_type'],
                                           [fig.TEST_READS, fig.TEST_REV_READS, fig.TEST_BARCODES, 'paired_end'],
@@ -684,7 +685,7 @@ def upload_sequencing_run(self):
         self.assertStatus('200 OK')
         sleep(5)
         mail = receive_email(self.server_user, 'upload-run',
-                             'user {} uploaded data for the {}'.format(self.server_user, 'TEST_RUN'))
+                             'user {} uploaded data for the {}'.format(self.server_user, run_name))
         self.access_code = mail.split('access code:')[1].splitlines()[1]
 
     def modify_upload(self):
9 changes: 6 additions & 3 deletions mmeds/util.py
@@ -378,7 +378,7 @@ def get_valid_columns(metadata_file, option, ignore_bad_cols=False):
         # Ensure there aren't any invalid columns specified to be included in the analysis
         try:
             # If 'all' only select columns that don't have all the same or all unique values
-            if df[col].isnull().all() or df[col].nunique() == 1 or df[col].nunique() == len(df[col]):
+            if df[col].isnull().all() or df[col].nunique() == 1:
                 if col in ['Together', 'Separate']:
                     summary_cols.append(col)
                     col_types[col] = False
@@ -389,8 +389,11 @@ def get_valid_columns(metadata_file, option, ignore_bad_cols=False):
             # If the columns is explicitly specified only check that it exists in the metadata
             else:
                 assert df[col].any()
-                summary_cols.append(col)
-                col_types[col] = pd.api.types.is_numeric_dtype(df[col])
+                col_type = pd.api.types.is_numeric_dtype(df[col])
+                # Continue if metadata is continuous or, if categorical, not all unique vals
+                if col_type or not df[col].nunique() == len(df[col]):
+                    summary_cols.append(col)
+                    col_types[col] = col_type
         except KeyError:
             if not ignore_bad_cols:
                 raise InvalidConfigError('Invalid metadata column {} in config file'.format(col))
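The net effect of these two changes is that the all-unique check now only disqualifies categorical columns: continuous (numeric) columns are kept for the summary even when every value is distinct, while all-null and constant columns are still filtered out. A minimal sketch of that selection rule as a standalone helper (the function name and sample data are illustrative, not from mmeds):

    import pandas as pd

    def is_summarizable(df: pd.DataFrame, col: str) -> bool:
        """Keep numeric columns; keep categorical columns only when values repeat."""
        series = df[col]
        if series.isnull().all() or series.nunique() == 1:
            return False                        # empty or constant columns carry no signal
        if pd.api.types.is_numeric_dtype(series):
            return True                         # continuous data is always usable
        return series.nunique() < len(series)   # categorical data must have repeated values

    df = pd.DataFrame({'Age': [34, 27, 51], 'SubjectID': ['a', 'b', 'c'], 'Group': ['x', 'x', 'y']})
    print([c for c in df.columns if is_summarizable(df, c)])  # ['Age', 'Group']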
44 changes: 29 additions & 15 deletions mmeds/validate.py
@@ -219,24 +219,37 @@ def check_NA(self, column):
             if pd.isna(value):
                 self.errors.append(err.format(row=i, col=self.col_index))
 
-    def check_duplicates(self, column, column2=None):
+    def check_duplicates(self, column, runs=None, column2=None):
         """ Checks for any duplicate entries in the provided column(s) """
-        cells = defaultdict(list)
         # Concatenate dual barcodes
         if column2 is not None:
             column = [str(c1)+str(c2) for c1, c2 in zip(column, column2)]
+        if runs is None:
+            runs = ["run" for i in range(len(column))]
+
+        # Create dictionary of which indices should be checked per-run
+        ranges = {}
+        for i, cell in enumerate(runs):
+            if cell not in ranges:
+                ranges[cell] = [i]
+            else:
+                ranges[cell].append(i)
 
-        # Add the indices of each item
-        for i, cell in enumerate(column):
-            cells[cell].append(i)
-        # Find any duplicates
-        dups = {k: v for k, v in cells.items() if len(v) > 1}
-        err_str = '{}\t{}\tDuplicate Value Error: Duplicate value {} of row {} in row {} in column {}.'
-        for dup_key in dups.keys():
-            value = dups[dup_key]
-            for val in value[1:]:
-                if not pd.isnull(dup_key):
-                    self.errors.append(err_str.format(val, self.col_index, dup_key, value[0], val, self.cur_col))
+        # Check duplicates per-sequencing run
+        for run in ranges:
+            cells = defaultdict(list)
+            # Add the indices of each item
+            for i, cell in enumerate(column):
+                if i in ranges[run]:
+                    cells[cell].append(i)
+            # Find any duplicates
+            dups = {k: v for k, v in cells.items() if len(v) > 1}
+            err_str = '{}\t{}\tDuplicate Value Error: Duplicate value {} of row {} in row {} in column {}.'
+            for dup_key in dups.keys():
+                value = dups[dup_key]
+                for val in value[1:]:
+                    if not pd.isnull(dup_key):
+                        self.errors.append(err_str.format(val, self.col_index, dup_key, value[0], val, self.cur_col))
 
     def check_sequencing_runs(self, column):
         """ Check that the sequencing runs exist """
@@ -391,9 +404,10 @@ def check_table_column(self):
         if self.cur_table == 'RawData':
             if self.cur_col == 'BarcodeSequence':
                 if self.barcodes_type == 'single':
-                    self.check_duplicates(col)
+                    self.check_duplicates(col, self.df['RawDataProtocol']['RawDataProtocolID'])
                 elif self.barcodes_type == 'dual':
-                    self.check_duplicates(col, self.df['AdditionalMetaData']['BarcodeSequenceR'])
+                    self.check_duplicates(col, self.df['RawDataProtocol']['RawDataProtocolID'],
+                                          self.df['AdditionalMetaData']['BarcodeSequenceR'])
                 self.check_lengths(col)
                 self.check_barcode_chars(col)
                 self.check_NA(col)
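Together with the check_table_column change, barcode duplicates are now flagged only within a single sequencing run (grouped by RawDataProtocolID) rather than across the entire metadata sheet. A stripped-down sketch of that per-run grouping outside the validator class; the helper name, return shape, and sample data are illustrative only:

    from collections import defaultdict

    def per_run_duplicates(column, runs=None):
        """Return {run: {value: [row indices]}} for values that repeat within a run."""
        if runs is None:
            runs = ['run'] * len(column)      # no run info: treat everything as one run
        by_run = defaultdict(lambda: defaultdict(list))
        for i, (run, value) in enumerate(zip(runs, column)):
            by_run[run][value].append(i)
        return {run: {v: idx for v, idx in cells.items() if len(idx) > 1}
                for run, cells in by_run.items()}

    barcodes = ['AAGT', 'CCTG', 'AAGT', 'AAGT']
    runs = ['Run1', 'Run1', 'Run2', 'Run2']
    print(per_run_duplicates(barcodes, runs))
    # {'Run1': {}, 'Run2': {'AAGT': [2, 3]}} -- the same barcode may repeat across runs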