Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
xmuyzz committed Feb 5, 2022
1 parent 92974d0 commit 85d0cb4
Show file tree
Hide file tree
Showing 141 changed files with 6,980 additions and 0 deletions.
Binary file added __pycache__/yaml.cpython-38.pyc
Binary file not shown.
Binary file added get_data/.DS_Store
Binary file not shown.
Empty file added get_data/__ini__.py
Empty file.
Binary file added get_data/__pycache__/data_gen_flow.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file added get_data/__pycache__/exval_dataset.cpython-38.pyc
Binary file not shown.
Binary file added get_data/__pycache__/get_data_arr.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added get_data/__pycache__/pred_dataset.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added get_data/__pycache__/test_dataset.cpython-38.pyc
Binary file not shown.
Binary file added get_data/__pycache__/train_dataset.cpython-38.pyc
Binary file not shown.
Binary file not shown.
Binary file added get_data/__pycache__/val_dataset.cpython-38.pyc
Binary file not shown.
157 changes: 157 additions & 0 deletions get_data/data_gen_flow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@

import os
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from PIL import Image
import glob
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator



def train_generator(proj_dir, batch_size, input_channel=3):

    """
    Create a Keras data generator for the training dataset.

    Arguments:
        proj_dir {path} -- path to the main project folder; arrays and label
            CSVs are read from its 'pro_data' subfolder;
        batch_size {int} -- batch size for the data generator;
        input_channel {int} -- number of image channels (1 or 3);
    Raises:
        ValueError -- if input_channel is neither 1 nor 3;
    Return:
        Keras data generator yielding (image batch, label batch);
    """

    pro_data_dir = os.path.join(proj_dir, 'pro_data')
    # makedirs with exist_ok also creates missing parents and is a no-op
    # when the folder is already there
    os.makedirs(pro_data_dir, exist_ok=True)

    ### load train data based on input channels
    if input_channel == 1:
        fn = 'train_arr_1ch.npy'
    elif input_channel == 3:
        fn = 'train_arr_3ch.npy'
    else:
        # previously fell through with `fn` unbound -> opaque NameError
        raise ValueError('input_channel must be 1 or 3, got: {}'.format(input_channel))
    x_train = np.load(os.path.join(pro_data_dir, fn))

    ### load train labels; one integer label per stacked image slice
    train_df = pd.read_csv(os.path.join(pro_data_dir, 'train_img_df.csv'))
    y_train = np.asarray(train_df['label']).astype('int').reshape((-1, 1))

    ## data generator with light augmentation (small rotations/shifts);
    ## all other ImageDataGenerator options are left at their defaults
    datagen = ImageDataGenerator(
        rotation_range=5,
        width_shift_range=0.1,
        height_shift_range=0.1,
        fill_mode='nearest',
    )

    ### Train generator; fixed seed keeps the shuffle reproducible
    train_gen = datagen.flow(
        x=x_train,
        y=y_train,
        subset=None,
        batch_size=batch_size,
        seed=42,
        shuffle=True,
    )
    print('Train generator created')

    return train_gen




def val_generator(proj_dir, batch_size, input_channel=3):

    """
    Create a Keras data generator for the validation dataset.

    No augmentation is applied: validation images must be evaluated as-is,
    so a default-configured ImageDataGenerator is used.

    Arguments:
        proj_dir {path} -- path to the main project folder; arrays and label
            CSVs are read from its 'pro_data' subfolder;
        batch_size {int} -- batch size for the data generator;
        input_channel {int} -- number of image channels (1 or 3);
    Raises:
        ValueError -- if input_channel is neither 1 nor 3;
    Return:
        (x_val, y_val, val_gen) -- the raw validation array, its labels,
        and the Keras data generator over them;
    """

    pro_data_dir = os.path.join(proj_dir, 'pro_data')
    # makedirs with exist_ok also creates missing parents and is a no-op
    # when the folder is already there
    os.makedirs(pro_data_dir, exist_ok=True)

    ### load val data based on input channels
    if input_channel == 1:
        fn = 'val_arr_1ch.npy'
    elif input_channel == 3:
        fn = 'val_arr_3ch.npy'
    else:
        # previously fell through with `fn` unbound -> opaque NameError
        raise ValueError('input_channel must be 1 or 3, got: {}'.format(input_channel))
    x_val = np.load(os.path.join(pro_data_dir, fn))

    ### load val labels; one integer label per stacked image slice
    val_df = pd.read_csv(os.path.join(pro_data_dir, 'val_img_df.csv'))
    y_val = np.asarray(val_df['label']).astype('int').reshape((-1, 1))

    # NOTE(review): the original built a fully-parameterized generator and
    # then immediately overwrote it with a default one; only the default
    # (no-augmentation) generator was ever used, so the dead copy is removed.
    datagen = ImageDataGenerator()
    val_gen = datagen.flow(
        x=x_val,
        y=y_val,
        subset=None,
        batch_size=batch_size,
        seed=42,
        shuffle=True,
    )
    print('val generator created')

    return x_val, y_val, val_gen



214 changes: 214 additions & 0 deletions get_data/exval_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import glob
import shutil
import os
import pandas as pd
import numpy as np
import nrrd
import re
import matplotlib
import matplotlib.pyplot as plt
import pickle
from time import gmtime, strftime
from datetime import datetime
import timeit
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from utils.resize_3d import resize_3d
from utils.crop_image import crop_image
from utils.respacing import respacing
from utils.nrrd_reg import nrrd_reg_rigid_ref
from get_data.get_img_dataset import img_dataset



def exval_pat_dataset(out_dir, proj_dir, crop_shape=[192, 192, 140],
                      interp_type='linear', input_channel=3,
                      norm_type='np_clip', data_exclude=None, new_spacing=[1, 1, 3]):

    """
    Preprocess data (respacing, registration, cropping) for chest CT dataset;

    Arguments:
        out_dir {path} -- path to result outputs;
        proj_dir {path} -- path to main project folder;
    Keyword arguments:
        crop_shape {list} -- target array size after cropping;
        interp_type {str} -- interpolation type for respacing, default: 'linear';
        input_channel {int} -- input channel for image (currently unused here);
        norm_type {str} -- normalization type (currently unused here);
        data_exclude {list} -- patient IDs to exclude due to data issues, default: None;
        new_spacing {list} -- respacing size, default [1, 1, 3];
    Return:
        saves registered/cropped nrrd image data to disk and writes
        'exval_pat_df.csv' (ID, file path, label) to pro_data_dir;
    """

    # NOTE(review): source data path is hard-coded to a lab mount; confirm
    # it exists in the deployment environment
    NSCLC_data_dir = '/mnt/aertslab/DATA/Lung/TOPCODER/nrrd_data'
    NSCLC_reg_dir = os.path.join(out_dir, 'data/NSCLC_data_reg')
    exval1_dir = os.path.join(out_dir, 'exval1')
    pro_data_dir = os.path.join(proj_dir, 'pro_data')

    # exist_ok avoids a race/failure when the dirs already exist
    os.makedirs(NSCLC_reg_dir, exist_ok=True)
    os.makedirs(exval1_dir, exist_ok=True)
    os.makedirs(pro_data_dir, exist_ok=True)

    # fixed reference image for rigid registration
    reg_temp_img = os.path.join(exval1_dir, 'NSCLC001.nrrd')

    df_label = pd.read_csv(os.path.join(pro_data_dir, 'label_NSCLC.csv'))
    df_label.dropna(subset=['ctdose_contrast', 'top_coder_id'], how='any', inplace=True)
    df_id = pd.read_csv(os.path.join(pro_data_dir, 'harvard_rt.csv'))

    ## create df for dir, ID and labels on patient level
    fns = []
    IDs = []
    labels = []
    list_fn = [fn for fn in sorted(glob.glob(NSCLC_data_dir + '/*nrrd'))]
    for fn in list_fn:
        # patient ID is encoded in the filename; first 5 chars of the 3rd
        # underscore-separated token -- TODO confirm naming convention
        ID = fn.split('/')[-1].split('_')[2][0:5].strip()
        for label, top_coder_id in zip(df_label['ctdose_contrast'], df_label['top_coder_id']):
            tc_id = top_coder_id.split('_')[2].strip()
            if tc_id == ID:
                IDs.append(ID)
                labels.append(label)
                fns.append(fn)
    print('ID:', len(IDs))
    print('file:', len(fns))
    print('label:', len(labels))
    print('contrast scan in ex val:', labels.count(1))
    print('non-contrast scan in ex val:', labels.count(0))
    df = pd.DataFrame({'ID': IDs, 'file': fns, 'label': labels})
    df.to_csv(os.path.join(pro_data_dir, 'exval_pat_df.csv'))
    print('total test scan:', df.shape[0])

    ## delete excluded scans and repeated scans
    if data_exclude is not None:
        df_exclude = df[df['ID'].isin(data_exclude)]
        print('exclude scans:', df_exclude)
        # BUG FIX: original referenced undefined name `test_exclude` here,
        # which raised NameError whenever data_exclude was supplied
        df.drop(df[df['ID'].isin(data_exclude)].index, inplace=True)
        print('total test scans:', df.shape[0])
    pd.options.display.max_columns = 100
    pd.set_option('display.max_rows', 500)

    ### registration, respacing, cropping
    for fn, ID in zip(df['file'], df['ID']):
        print(ID)
        ## respacing to a common voxel spacing
        img_nrrd = respacing(
            nrrd_dir=fn,
            interp_type=interp_type,
            new_spacing=new_spacing,
            patient_id=ID,
            return_type='nrrd',
            save_dir=None
        )
        ## rigid registration against the fixed reference scan
        img_reg = nrrd_reg_rigid_ref(
            img_nrrd=img_nrrd,
            fixed_img_dir=reg_temp_img,
            patient_id=ID,
            save_dir=None
        )
        ## crop to crop_shape and save into the registered-data folder
        img_crop = crop_image(
            nrrd_file=img_reg,
            patient_id=ID,
            crop_shape=crop_shape,
            return_type='nrrd',
            save_dir=NSCLC_reg_dir
        )


def exval_img_dataset(proj_dir, slice_range=range(50, 120), input_channel=3,
                      norm_type='np_clip', split=True, fn_arr_1ch=None):

    """
    Build stacked image-slice arrays and per-image labels/IDs for the
    external validation cohort, optionally split into a fine-tuning set
    and a held-out test set.

    Arguments:
        proj_dir {path} -- path to main project folder; reads
            'pro_data/exval_pat_df.csv' (file, label, ID per patient);
    Keyword args:
        slice_range {range} -- image slice range in z direction for cropping;
        input_channel {int} -- image channel, default: 3;
        norm_type {str} -- image normalization type: 'np_clip' or 'np_linear';
        split {bool} -- if True, stratified 80/20 patient split into two
            datasets (exval1/exval2); if False, one dataset over all patients;
        fn_arr_1ch {str} -- filename for the 1-channel array (unused; kept
            for backward compatibility);
    Returns:
        None; saves numpy arrays and image-level dataframes via img_dataset;
    """

    pro_data_dir = os.path.join(proj_dir, 'pro_data')
    df = pd.read_csv(os.path.join(pro_data_dir, 'exval_pat_df.csv'))
    fns = df['file']
    labels = df['label']
    IDs = df['ID']

    ## split dataset for fine-tuning model and test model
    if split:
        data_exval1, data_exval2, label_exval1, label_exval2, ID_exval1, ID_exval2 = train_test_split(
            fns,
            labels,
            IDs,
            stratify=labels,
            shuffle=True,
            test_size=0.2,
            random_state=42
        )
        nrrd_splits = [data_exval1, data_exval2]
        label_splits = [label_exval1, label_exval2]
        ID_splits = [ID_exval1, ID_exval2]
        fn_arrs = ['exval1_arr1.npy', 'exval1_arr2.npy']
        fn_dfs = ['exval1_img_df1.csv', 'exval1_img_df2.csv']

        ## create numpy array of image slices for each split
        for nrrd, label, ID, fn_arr, fn_df in zip(nrrd_splits, label_splits, ID_splits, fn_arrs, fn_dfs):
            # BUG FIX: original passed the *full* lists (nrrds/IDs/labels)
            # and the undefined name `fn_arr_3ch` (NameError) instead of the
            # per-split loop variables; each split now gets its own data
            img_dataset(
                pro_data_dir=pro_data_dir,
                run_type='exval',
                nrrds=nrrd,
                IDs=ID,
                labels=label,
                fn_arr_1ch=None,
                fn_arr_3ch=fn_arr,
                fn_df=fn_df,
                slice_range=slice_range,
                input_channel=input_channel,
                norm_type=norm_type,
            )
        print('train and test datasets created!')

    ## use entire exval data to test model
    else:
        img_dataset(
            pro_data_dir=pro_data_dir,
            run_type='exval',
            nrrds=fns,
            IDs=IDs,
            labels=labels,
            fn_arr_1ch=None,
            fn_arr_3ch='exval1_arr.npy',
            fn_df='exval1_img_df.csv',
            slice_range=slice_range,
            input_channel=input_channel,
            norm_type=norm_type,
        )
        print('total patient:', len(IDs))
        print('exval datasets created!')

Loading

0 comments on commit 85d0cb4

Please sign in to comment.