Merge pull request #16 from cilab-ufersa/feature/new

Feature/new
cilab-ufersa · Jun 19, 2023 · f1ec52d · f1ec52d
2 parents 08d7885 + 690ea5a
commit f1ec52d
Show file tree

Hide file tree

Showing 19 changed files with 8,904 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,12 @@
 # Scope is all repo folders.
 *.json
 *.ini 
+*.log
+mlruns/
+catboost_info/
+*.png
+*.eps
+
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
-# Period Cycle Prediction
+# Predictive Modeling of Menstrual Cycle Length using Artificial Intelligence ⏰
 
 ## About 
 
-Artificial intelligence implementation for better cycle predictions. 
+A time series forecasting approach based on artificial intelligence for better cycle predictions.
 
 The period can be uncertain when a woman has irregular cycles. Moreover, the length of the menstrual cycle varies from woman to woman, so every woman has her own particular cycle. AI can help us better understand women's cycles.
 
@@ -35,3 +35,9 @@ $ pip install -r requirements.txt
 |---- |---- | ----- | ------ |
 | 6 | 30|20XX | Starts |
 | 7 | 1|20XX | Ends |
+
+---
+
+## Publications related to this project
+
+Rosana Rego. 2023. [Predictive Modeling of Menstrual Cycle Length: A Time Series Forecasting Approach](https://doi.org/10.21203/rs.3.rs-3050181/v1), PREPRINT (Version 1) available at Research Square.
diff --git a/period_cycle_prediction/models/arima.py b/period_cycle_prediction/models/arima.py
@@ -0,0 +1,65 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from utils.utils import generate_synthetic_data, generate_final_features, split_dataset, convet2dataframe
+from darts import TimeSeries
+from darts.models import AutoARIMA
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+import warnings 
+warnings.filterwarnings("ignore")
+
+if __name__ == '__main__': 
+
+# load the data
+total_regular_cycle_data = pd.read_csv('dataset/total_regular_cycle_data.csv')
+features_total_regular_cycle_data, labels_total_regular_cycle_data = generate_final_features(total_regular_cycle_data)
+input_train_total_regular_cycle, input_test_total_regular_cycle, output_train_total_regular_cycle, output_test_total_regular_cycle = split_dataset(features_total_regular_cycle_data, labels_total_regular_cycle_data, reshape=False)
+
+input_train_total_regular_cycle_df = convet2dataframe(input_train_total_regular_cycle, ['period', 'cycle'])
+output_train_total_regular_cycle = convet2dataframe(output_train_total_regular_cycle, ['period', 'cycle'])
+input_test_total_regular_cycle_df = convet2dataframe(input_test_total_regular_cycle, ['period', 'cycle'])
+series_test = TimeSeries.from_dataframe(input_test_total_regular_cycle_df, 'time', ['period'])
+output_train_series = TimeSeries.from_dataframe(output_train_total_regular_cycle, 'time', ['period'])
+series = TimeSeries.from_dataframe(input_train_total_regular_cycle_df, time_col='time', value_cols=['period'])
+
+# series for cycle prediction
+series_cycle = TimeSeries.from_dataframe(input_train_total_regular_cycle_df, time_col='time', value_cols=['cycle'])
+series_cycle_test = TimeSeries.from_dataframe(input_test_total_regular_cycle_df, time_col='time', value_cols=['cycle'])
+
+# train the model
+model = AutoARIMA()
+model.fit(series)
+
+# make prediction
+prediction_ = model.predict(len(series_test))
+#-----------------------------------#
+# model arima for cycle 
+model_cycle = AutoARIMA()
+model_cycle.fit(series_cycle)
+# prediction the cycle 
+prediction_cycle = model_cycle.predict(3)
+
+testScore = np.sqrt(mean_squared_error(series_test.values(), prediction_.values()))
+print('Test Score: %.2f MSE' % (testScore))
+# calculate mean absolute error
+testScore = mean_absolute_error(series_test.values(), prediction_.values())
+print('Test Score: %.2f MAE' % (testScore))
+# RMSE
+print('Test Score: %.2f RMSE' % np.sqrt(testScore))
+# calculate r2 score
+testScore = r2_score(series_test.values(), prediction_.values())
+print('Test Score: %.2f R2' % (testScore))
+
+plt.figure(figsize=(4, 3))
+plt.plot(np.arange(1,16),series_test.values()[-16:], '-->', linewidth=2.0)
+plt.plot(np.arange(16, 17),prediction.values()[0].astype(int), 'o')
+plt.plot(np.arange(16, 17),prediction.values()[0].astype(int), 'h')
+plt.plot(np.arange(16, 17),prediction.values()[0].astype(int), '*')
+# round the number in axis
+plt.gca().yaxis.set_major_locator(plt.MaxNLocator(integer=True))
+plt.gca().xaxis.set_major_locator(plt.MaxNLocator(integer=True))
+plt.ylabel('Days')
+plt.xlabel('Months')
+plt.legend(['Cycle serie', 'ARIMA', 'LSTM', 'Lasso'], loc='lower left')
+plt.title('Case 1: Predicting the next cycle duration')
+plt.grid(True)
diff --git a/period_cycle_prediction/models/lstm_simulation_case1.py b/period_cycle_prediction/models/lstm_simulation_case1.py
@@ -0,0 +1,55 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import tensorflow as tf
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, LSTM, Dropout
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import r2_score
+from sklearn.metrics import explained_variance_score
+from sklearn.metrics import max_error
+from utils.utils import generate_synthetic_data, generate_final_features, split_dataset
+import warnings 
+warnings.filterwarnings("ignore")
+
+if __name__ == '__main__': 
+
+ total_regular_cycle_data = pd.read_csv('dataset\\total_regular_cycle_data.csv')
+ features_total_regular_cycle_data, labels_total_regular_cycle_data = generate_final_features(total_regular_cycle_data)
+ input_train_total_regular_cycle, input_test_total_regular_cycle, output_train_total_regular_cycle, output_test_total_regular_cycle = split_dataset(features_total_regular_cycle_data, labels_total_regular_cycle_data, reshape=False)
+
+ # create and fit the LSTM network
+ n_features = input_train_total_regular_cycle.shape[2]
+ model = Sequential()
+ model.add(LSTM(64, input_shape=(3, n_features), activation='tanh'))
+ model.add(Dropout(0.05))
+ model.add(Dense(n_features, activation='relu'))
+ model.summary()
+
+ opt=tf.keras.optimizers.Adam(learning_rate=0.01)
+ model.compile(loss='mean_squared_error', optimizer=opt, run_eagerly=True)
+ # add early stopping
+ early_stopping = tf.keras.callbacks.EarlyStopping( monitor='val_loss', patience=5, restore_best_weights=True)
+
+ history = model.fit(input_train_total_regular_cycle, output_train_total_regular_cycle, epochs=100, batch_size=32, validation_split=0.2, verbose=1, callbacks=[early_stopping])
+
+ # plot the loss and val loss
+ plt.figure(figsize=(4, 3))
+ plt.plot(history.history['loss'], '-', linewidth=2)
+ plt.plot(history.history['val_loss'], '--', linewidth=2)
+ plt.grid(True)
+ plt.legend(['Train', 'Validation'])
+ plt.ylabel('Loss')
+ plt.xlabel('Epoch')
+ plt.title('Case 1: LSTM model loss')
+ ax = plt.axes([0.6, 0.4, .20, .20])
+ plt.plot(history.history['loss'], '-', linewidth=2)
+ plt.plot(history.history['val_loss'], '--', linewidth=2)
+ plt.grid(True)
+ ax.set_ylim(0.1, 3)
+ ax.set_xlim(70, 93)
+ # save history
+ history_df = pd.DataFrame(history.history)
+ history_df.to_csv('case1_history_lstm.csv', index=False)
diff --git a/period_cycle_prediction/models/lstm_simulation_case2.py b/period_cycle_prediction/models/lstm_simulation_case2.py
@@ -0,0 +1,70 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import tensorflow as tf
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, LSTM, Dropout
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+from sklearn.metrics import mean_absolute_error
+from sklearn.metrics import r2_score
+from sklearn.metrics import explained_variance_score
+from sklearn.metrics import max_error
+from utils.utils import generate_synthetic_data, generate_final_features, split_dataset
+import warnings 
+warnings.filterwarnings("ignore")
+
+if __name__ == '__main__': 
+
+ regular_cycle_data = pd.read_csv('dataset/regular_cycle_data.csv')
+ features_regular_cycle_data, labels_regular_cycle_data = generate_final_features(regular_cycle_data)
+ input_train_regular_cycle, input_test_regular_cycle, output_train_regular_cycle, output_test_regular_cycle = split_dataset(features_regular_cycle_data, labels_regular_cycle_data, reshape=False)
+
+ n_features = input_train_regular_cycle.shape[2]
+ model = Sequential()
+ model.add(LSTM(units=128, return_sequences=True, input_shape=(input_train_regular_cycle.shape[1], input_train_regular_cycle.shape[2])))
+ model.add(Dropout(0.2))
+ model.add(LSTM(units=64, return_sequences=True))
+ model.add(Dropout(0.2))
+ model.add(LSTM(units=32))
+ model.add(Dropout(0.2))
+ model.add(Dense(units=n_features, activation='relu'))
+
+ opt=tf.keras.optimizers.Adam()
+ model.compile(loss='mean_squared_error', optimizer=opt)
+ # add early stopping
+ early_stopping = tf.keras.callbacks.EarlyStopping( monitor='val_loss', patience=10, restore_best_weights=True)
+
+ history = model.fit(input_train_regular_cycle, output_train_regular_cycle, epochs=2000, batch_size=32, validation_split=0.2, callbacks=[early_stopping])
+
+ # plot the loss and val loss
+ plt.figure(figsize=(4, 3))
+ plt.plot(history.history['loss'], '-', linewidth=2)
+ plt.plot(history.history['val_loss'], '--', linewidth=2)
+ plt.grid(True)
+ plt.legend(['Train', 'Validation'])
+ plt.ylabel('Loss')
+ plt.xlabel('Epoch')
+ plt.title('Case 2: LSTM model loss')
+ # log scale
+ #plt.yscale('log')
+
+ # add a zoom in epoch 70 to 100
+ ax = plt.axes([0.6, 0.4, .20, .20])
+ plt.plot(history.history['loss'], '-', linewidth=2)
+ plt.plot(history.history['val_loss'], '--', linewidth=2)
+ plt.grid(True)
+ ax.set_ylim(1, 6)
+ ax.set_xlim(1500, 1650)
+
+
+ # save history
+ history_df = pd.DataFrame(history.history)
+ history_df.to_csv('case2_history_lstm.csv', index=False)
+
+ # save figure
+ fig = plt.gcf()
+ fig.savefig('case2_loss_lstm.eps', dpi=300, bbox_inches='tight')
+
+ # save model 
+ model.save('case2_lstm_model.h5')