Merge pull request #17 from cilab-ufersa/develop

Develop
cilab-ufersa · Jun 19, 2023 · 516a138 · 516a138
2 parents 4a983ae + f1ec52d
commit 516a138
Show file tree

Hide file tree

Showing 20 changed files with 8,904 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,12 @@
 # Scope is all repo folders.
 *.json
 *.ini 
+*.log
+mlruns/
+catboost_info/
+*.png
+*.eps
+
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
-# Period Cycle Prediction
+# Predictive Modeling of Menstrual Cycle Length using Artificial Intelligence ⏰
 
 ## About 
 
-Artificial intelligence implementation for better cycle predictions. 
+A time series forecasting approach based on artificial intelligence for better cycle predictions. 
 
 The period can be uncertain when a woman has irregular cycles. Moreover, the length of the period cycle varies from woman to woman. Therefore, every woman has a particular cycle. AI can help us to understand better about women cycles.
 
@@ -35,3 +35,9 @@ $ pip install -r requirements.txt
 |---- |---- | ----- | ------ |
 | 6 | 30|20XX | Starts  |
 | 7 | 1|20XX | Ends  |
+
+---
+
+## Publications related to this project
+
+Rosana Rego. 2023. [Predictive Modeling of Menstrual Cycle Length: A Time Series Forecasting Approach](https://doi.org/10.21203/rs.3.rs-3050181/v1), PREPRINT (Version 1) available at Research Square.
diff --git a/icon.png b/icon.png
diff --git a/period_cycle_prediction/models/arima.py b/period_cycle_prediction/models/arima.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from utils.utils import generate_synthetic_data, generate_final_features, split_dataset, convet2dataframe
+from darts import TimeSeries
+from darts.models import AutoARIMA
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+import warnings 
+warnings.filterwarnings("ignore")
+
+if __name__ == '__main__': 
+
+# load the data
+total_regular_cycle_data = pd.read_csv('dataset/total_regular_cycle_data.csv')
+features_total_regular_cycle_data, labels_total_regular_cycle_data = generate_final_features(total_regular_cycle_data)
+input_train_total_regular_cycle, input_test_total_regular_cycle, output_train_total_regular_cycle, output_test_total_regular_cycle = split_dataset(features_total_regular_cycle_data, labels_total_regular_cycle_data, reshape=False)
+
+input_train_total_regular_cycle_df = convet2dataframe(input_train_total_regular_cycle, ['period', 'cycle'])
+output_train_total_regular_cycle = convet2dataframe(output_train_total_regular_cycle, ['period', 'cycle'])
+input_test_total_regular_cycle_df = convet2dataframe(input_test_total_regular_cycle, ['period', 'cycle'])
+series_test = TimeSeries.from_dataframe(input_test_total_regular_cycle_df, 'time', ['period'])
+output_train_series = TimeSeries.from_dataframe(output_train_total_regular_cycle, 'time', ['period'])
+series = TimeSeries.from_dataframe(input_train_total_regular_cycle_df, time_col='time', value_cols=['period'])
+
+# series for cycle prediction
+series_cycle = TimeSeries.from_dataframe(input_train_total_regular_cycle_df, time_col='time', value_cols=['cycle'])
+series_cycle_test = TimeSeries.from_dataframe(input_test_total_regular_cycle_df, time_col='time', value_cols=['cycle'])
+
+# train the model
+model = AutoARIMA()
+model.fit(series)
+
+# make prediction
+prediction_ = model.predict(len(series_test))
+#-----------------------------------#
+# model arima for cycle 
+model_cycle = AutoARIMA()
+model_cycle.fit(series_cycle)
+# prediction the cycle 
+prediction_cycle = model_cycle.predict(3)
+
+testScore = np.sqrt(mean_squared_error(series_test.values(), prediction_.values()))
+print('Test Score: %.2f MSE' % (testScore))
+# calculate mean absolute error
+testScore = mean_absolute_error(series_test.values(), prediction_.values())
+print('Test Score: %.2f MAE' % (testScore))
+# RMSE
+print('Test Score: %.2f RMSE' % np.sqrt(testScore))
+# calculate r2 score
+testScore = r2_score(series_test.values(), prediction_.values())
+print('Test Score: %.2f R2' % (testScore))
+
+plt.figure(figsize=(4, 3))
+plt.plot(np.arange(1,16),series_test.values()[-16:], '-->', linewidth=2.0)
+plt.plot(np.arange(16, 17),prediction.values()[0].astype(int), 'o')
+plt.plot(np.arange(16, 17),prediction.values()[0].astype(int), 'h')
+plt.plot(np.arange(16, 17),prediction.values()[0].astype(int), '*')
+# round the number in axis
+plt.gca().yaxis.set_major_locator(plt.MaxNLocator(integer=True))
+plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))
+plt.ylabel('Days')
+plt.xlabel('Months')
+plt.legend(['Cycle serie', 'ARIMA', 'LSTM', 'Lasso'], loc='lower left')
+plt.title('Case 1: Predicting the next cycle duration')
+plt.grid(True)
diff --git a/period_cycle_prediction/models/lstm_simulation_case1.py b/period_cycle_prediction/models/lstm_simulation_case1.py
@@ -0,0 +1,55 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import tensorflow as tf
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, LSTM, Dropout
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import r2_score
+from sklearn.metrics import explained_variance_score
+from sklearn.metrics import max_error
+from utils.utils import generate_synthetic_data, generate_final_features, split_dataset
+import warnings 
+warnings.filterwarnings("ignore")
+
+if __name__ == '__main__': 
+
+    total_regular_cycle_data = pd.read_csv('dataset\\total_regular_cycle_data.csv')
+    features_total_regular_cycle_data, labels_total_regular_cycle_data = generate_final_features(total_regular_cycle_data)
+    input_train_total_regular_cycle, input_test_total_regular_cycle, output_train_total_regular_cycle, output_test_total_regular_cycle = split_dataset(features_total_regular_cycle_data, labels_total_regular_cycle_data, reshape=False)
+
+    # create and fit the LSTM network
+    n_features = input_train_total_regular_cycle.shape[2]
+    model = Sequential()
+    model.add(LSTM(64, input_shape=(3, n_features),  activation='tanh'))
+    model.add(Dropout(0.05))
+    model.add(Dense(n_features, activation='relu'))
+    model.summary()
+
+    opt=tf.keras.optimizers.Adam(learning_rate=0.01)
+    model.compile(loss='mean_squared_error', optimizer=opt, run_eagerly=True)
+    # add early stopping
+    early_stopping = tf.keras.callbacks.EarlyStopping( monitor='val_loss', patience=5, restore_best_weights=True)
+
+    history = model.fit(input_train_total_regular_cycle, output_train_total_regular_cycle, epochs=100, batch_size=32, validation_split=0.2, verbose=1, callbacks=[early_stopping])
+
+    # plot the loss and val loss
+    plt.figure(figsize=(4, 3))
+    plt.plot(history.history['loss'], '-', linewidth=2)
+    plt.plot(history.history['val_loss'], '--', linewidth=2)
+    plt.grid(True)
+    plt.legend(['Train', 'Validation'])
+    plt.ylabel('Loss')
+    plt.xlabel('Epoch')
+    plt.title('Case 1: LSTM model loss')
+    ax = plt.axes([0.6, 0.4, .20, .20])
+    plt.plot(history.history['loss'], '-', linewidth=2)
+    plt.plot(history.history['val_loss'], '--', linewidth=2)
+    plt.grid(True)
+    ax.set_ylim(0.1, 3)
+    ax.set_xlim(70, 93)
+    # save history
+    history_df = pd.DataFrame(history.history)
+    history_df.to_csv('case1_history_lstm.csv', index=False)
diff --git a/period_cycle_prediction/models/lstm_simulation_case2.py b/period_cycle_prediction/models/lstm_simulation_case2.py
@@ -0,0 +1,70 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import tensorflow as tf
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, LSTM, Dropout
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import r2_score
+from sklearn.metrics import explained_variance_score
+from sklearn.metrics import max_error
+from utils.utils import generate_synthetic_data, generate_final_features, split_dataset
+import warnings 
+warnings.filterwarnings("ignore")
+
+if __name__ == '__main__': 
+
+    regular_cycle_data = pd.read_csv('dataset/regular_cycle_data.csv')
+    features_regular_cycle_data, labels_regular_cycle_data = generate_final_features(regular_cycle_data)
+    input_train_regular_cycle, input_test_regular_cycle, output_train_regular_cycle, output_test_regular_cycle = split_dataset(features_regular_cycle_data, labels_regular_cycle_data, reshape=False)
+
+    n_features = input_train_regular_cycle.shape[2]
+    model = Sequential()
+    model.add(LSTM(units=128, return_sequences=True, input_shape=(input_train_regular_cycle.shape[1], input_train_regular_cycle.shape[2])))
+    model.add(Dropout(0.2))
+    model.add(LSTM(units=64, return_sequences=True))
+    model.add(Dropout(0.2))
+    model.add(LSTM(units=32))
+    model.add(Dropout(0.2))
+    model.add(Dense(units=n_features, activation='relu'))
+
+    opt=tf.keras.optimizers.Adam()
+    model.compile(loss='mean_squared_error', optimizer=opt)
+    # add early stopping
+    early_stopping = tf.keras.callbacks.EarlyStopping( monitor='val_loss', patience=10, restore_best_weights=True)
+
+    history = model.fit(input_train_regular_cycle, output_train_regular_cycle, epochs=2000, batch_size=32, validation_split=0.2, callbacks=[early_stopping])
+
+    # plot the loss and val loss
+    plt.figure(figsize=(4, 3))
+    plt.plot(history.history['loss'], '-', linewidth=2)
+    plt.plot(history.history['val_loss'], '--', linewidth=2)
+    plt.grid(True)
+    plt.legend(['Train', 'Validation'])
+    plt.ylabel('Loss')
+    plt.xlabel('Epoch')
+    plt.title('Case 2: LSTM model loss')
+    # log scale
+    #plt.yscale('log')
+
+    # add a zoom in epoch 70 to 100
+    ax = plt.axes([0.6, 0.4, .20, .20])
+    plt.plot(history.history['loss'], '-', linewidth=2)
+    plt.plot(history.history['val_loss'], '--', linewidth=2)
+    plt.grid(True)
+    ax.set_ylim(1, 6)
+    ax.set_xlim(1500, 1650)
+
+
+    # save history
+    history_df = pd.DataFrame(history.history)
+    history_df.to_csv('case2_history_lstm.csv', index=False)
+
+    # save figure
+    fig = plt.gcf()
+    fig.savefig('case2_loss_lstm.eps', dpi=300, bbox_inches='tight')
+
+    # save model 
+    model.save('case2_lstm_model.h5')