|
| 1 | +import pandas as pd |
| 2 | +from sklearn.model_selection import train_test_split, GridSearchCV |
| 3 | +from sklearn.svm import SVC |
| 4 | +from sklearn.metrics import accuracy_score, confusion_matrix, classification_report |
| 5 | +from sklearn.preprocessing import StandardScaler |
| 6 | +from sklearn.pipeline import Pipeline |
| 7 | +import ta |
| 8 | +import numpy as np |
| 9 | +import matplotlib.pyplot as plt |
| 10 | + |
# Load the sentiment data from the Excel file
sentiment_file_path = '/Users/umuteyidogan/Desktop/IGP_Project/Daily_Sentiment_Analysis_Lem_Headline.xlsx'
sentiment_data = pd.read_excel(sentiment_file_path)

# Load the Bitcoin price data from the CSV file
bitcoin_file_path = '/Users/umuteyidogan/Desktop/IGP_Project/bitcoin_price_with_5_labels_2.csv'
bitcoin_data = pd.read_csv(bitcoin_file_path)

# Load the trading volume data with labels from the CSV file
trading_volume_file_path = '/Users/umuteyidogan/Desktop/IGP_Project/trading_volume_with_labels.csv'
trading_volume_data = pd.read_csv(trading_volume_file_path)

# Parse every date column to datetime so the merge keys compare equal
sentiment_data['Published date'] = pd.to_datetime(sentiment_data['Published date'])
bitcoin_data['Date'] = pd.to_datetime(bitcoin_data['Date'])
trading_volume_data['Date'] = pd.to_datetime(trading_volume_data['Date'])

# Inner-join sentiment with prices on the calendar date, then add the volume data.
# Only dates present in all three sources survive.
merged_data = pd.merge(sentiment_data, bitcoin_data, left_on='Published date', right_on='Date', how='inner')
final_data = pd.merge(merged_data, trading_volume_data, on='Date', how='inner')

# The indicators below are rolling/recursive statistics over consecutive rows, so
# the rows must be in chronological order. The inner merges keep the row order of
# the sentiment file, which is not guaranteed to be sorted, so sort explicitly.
final_data = final_data.sort_values('Date', kind='stable').reset_index(drop=True)

# Calculate technical indicators on the closing price
close_series = final_data['Close']
final_data['SMA_7'] = ta.trend.sma_indicator(close_series, window=7)
final_data['EMA_14'] = ta.trend.ema_indicator(close_series, window=14)
final_data['RSI'] = ta.momentum.rsi(close_series, window=14)
final_data['MACD'] = ta.trend.macd(close_series)
final_data['MACD_Signal'] = ta.trend.macd_signal(close_series)
final_data['Bollinger_High'] = ta.volatility.bollinger_hband(close_series)
final_data['Bollinger_Low'] = ta.volatility.bollinger_lband(close_series)

# Drop rows with NaN values (the first rows of each indicator's warm-up window,
# plus any row that was already incomplete in the source files)
final_data = final_data.dropna()

# Select relevant columns for the final dataset
final_data = final_data[['Published date', 'Positive_Percentage', 'Negative_Percentage', 'Neutral_Percentage',
                         'Volume', 'SMA_7', 'EMA_14', 'RSI', 'MACD', 'MACD_Signal', 'Bollinger_High',
                         'Bollinger_Low', 'Close', 'Label']]

# Rows are deliberately left in chronological order (no extra shuffle here): the
# train/test split step is responsible for deciding how rows are partitioned, and
# shuffling at this point would only destroy the time ordering of the prices.
final_data = final_data.reset_index(drop=True)
| 53 | + |
# Define the features and target variable
feature_columns = ['Positive_Percentage', 'Negative_Percentage', 'Neutral_Percentage', 'Volume',
                   'SMA_7', 'EMA_14', 'RSI', 'MACD', 'MACD_Signal', 'Bollinger_High',
                   'Bollinger_Low']

# Put the rows in chronological order before splitting. The trading simulation
# further down computes returns between *consecutive* test-set prices, which is
# only meaningful when the test set is a contiguous, time-ordered block of days.
# A chronological hold-out also keeps future rows out of the training data.
ordered_data = final_data.sort_values('Published date', kind='stable').reset_index(drop=True)
features = ordered_data[feature_columns]
target = ordered_data['Label']

# Split the data: first 80% of the timeline for training, last 20% for testing
X_train, X_test, y_train, y_test, close_train, close_test = train_test_split(
    features, target, ordered_data['Close'], test_size=0.2, shuffle=False
)

# Define a pipeline with a scaler and SVM classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),   # Standardize features
    ('svc', SVC(probability=True))  # SVM Classifier with probability estimates
])

# Define the parameter grid for GridSearchCV.
# gamma is ignored by the linear kernel, so it is only searched for 'rbf';
# a single combined grid would refit identical linear models once per gamma value.
param_grid = [
    {
        'svc__kernel': ['linear'],
        'svc__C': [0.1, 1, 10, 100],  # Regularization parameter
    },
    {
        'svc__kernel': ['rbf'],
        'svc__C': [0.1, 1, 10, 100],            # Regularization parameter
        'svc__gamma': [1, 0.1, 0.01, 0.001],    # Kernel coefficient
    },
]

# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# GridSearchCV (refit=True by default) has already refitted the best pipeline on
# the whole training set, so best_estimator_ is ready to use without another fit.
best_model = grid_search.best_estimator_

# Predict the class probabilities on the test set
y_pred_prob = best_model.predict_proba(X_test)

# Predict the class labels on the test set. This is an independent call: it is
# not derived from y_pred_prob above.
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Display the results
print(f"Best parameters: {best_params}")
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

# Verify that the confusion matrix sums up to the total number of test instances
print(f"Total number of test instances: {len(y_test)}")
print(f"Sum of confusion matrix values: {conf_matrix.sum()}")
| 111 | + |
# Trading simulation
# Simulation settings
num_people = 100            # Simulated traders in each strategy group
initial_capital = 10000.0   # Starting capital per trader (float on purpose)
trade_amount = 1000         # Notional value committed on each trade
num_trades = len(y_test)    # One decision per test-set row (the simulators skip column 0)

# Model-based group: every trader follows the same predicted labels, one row per trader
model_trades = np.repeat(np.asarray(y_pred)[np.newaxis, :], num_people, axis=0).astype(int)

# Random group: each trader draws an independent label (0-4) for every step
np.random.seed(42)
random_trades = np.random.choice([0, 1, 2, 3, 4], size=(num_people, num_trades)).astype(int)  # Random decisions
| 125 | + |
# Function to simulate trades with the five-class labels
def simulate_trades(trades, prices, starting_capital=None, notional=None):
    """Return each trader's ending capital after replaying a matrix of trade labels.

    Labels are interpreted as 0 = Strong Sell, 1 = Sell, 2 = Hold, 3 = Buy,
    4 = Strong Buy. At step ``i`` (``i >= 1``) a trader's capital changes by
    ``exposure * notional * pct_change`` where ``pct_change`` is the relative
    price move from ``prices[i - 1]`` to ``prices[i]`` and ``exposure`` is
    -2, -1, 0, +1 or +2 for the five labels. Any other label value is treated
    as no trade. Column 0 of ``trades`` is never used because it has no
    previous price. Profits are additive on a fixed notional (not compounded).

    Args:
        trades: 2-D array of labels, shape ``(n_traders, n_steps)``.
        prices: 1-D sequence of prices with at least ``n_steps`` entries.
        starting_capital: Capital each trader starts with. Defaults to the
            module-level ``initial_capital``.
        notional: Value committed per unit of exposure. Defaults to the
            module-level ``trade_amount``.

    Returns:
        1-D float64 array of length ``n_traders`` with the ending capital.

    Raises:
        IndexError: If ``prices`` is shorter than the number of steps.
    """
    start = initial_capital if starting_capital is None else starting_capital
    size = trade_amount if notional is None else notional

    trades = np.asarray(trades)
    prices = np.asarray(prices, dtype=np.float64)
    n_traders, n_steps = trades.shape

    capital = np.full(n_traders, start, dtype=np.float64)
    if n_steps < 2:
        # No step has a previous price, so nobody's capital moves.
        return capital
    if prices.shape[0] < n_steps:
        raise IndexError(
            f"prices has {prices.shape[0]} entries but trades has {n_steps} columns"
        )

    # Relative price change for steps 1 .. n_steps - 1
    window = prices[:n_steps]
    pct_change = (window[1:] - window[:-1]) / window[:-1]

    # Map labels to signed exposure; Hold (2) and unknown labels stay at 0.
    decisions = trades[:, 1:]
    exposure = np.zeros(decisions.shape, dtype=np.float64)
    for label, multiplier in ((0, -2.0), (1, -1.0), (3, 1.0), (4, 2.0)):
        exposure[decisions == label] = multiplier

    # Mask flat positions explicitly so a non-finite price change cannot leak
    # into traders that did not trade on that step.
    pnl = np.where(exposure != 0, size * exposure * pct_change, 0.0)
    return capital + pnl.sum(axis=1)
| 146 | + |
# Use the Close prices from the test set
prices = close_test.values

# Ending capital of every simulated trader under each strategy
random_capital_end = simulate_trades(random_trades, prices)
model_capital_end = simulate_trades(model_trades, prices)

# Calculate average ending capital for both strategies
random_average_end_capital = np.mean(random_capital_end)
model_average_end_capital = np.mean(model_capital_end)

# Display the results
print(f"Average ending capital for random strategy: ${random_average_end_capital:.2f}")
print(f"Average ending capital for model-based strategy: ${model_average_end_capital:.2f}")

# Additional debugging info
print(f"Random strategy capital range: {random_capital_end.min()} to {random_capital_end.max()}")
print(f"Model-based strategy capital range: {model_capital_end.min()} to {model_capital_end.max()}")

# Plotting the results
plt.figure(figsize=(12, 6))

# Derive the bin edges from the simulated results instead of a fixed range:
# hard-coded edges either squeeze every trader into one or two bins or silently
# drop values that fall outside the range. Both groups share the same edges so
# the two histograms are directly comparable.
all_end_capital = np.concatenate([random_capital_end, model_capital_end])
all_end_capital = all_end_capital[np.isfinite(all_end_capital)]
bins = np.histogram_bin_edges(all_end_capital, bins=100)

# 'C0'/'C1' are the first two colours of the default cycle; naming them keeps
# each dashed mean line in the same colour as its histogram.
plt.hist(random_capital_end, bins=bins, alpha=0.7, label='Random Strategy', color='C0')
plt.hist(model_capital_end, bins=bins, alpha=0.7, label='Model-Based Strategy', color='C1')

plt.axvline(random_average_end_capital, color='C0', linestyle='dashed', linewidth=1)
plt.axvline(model_average_end_capital, color='C1', linestyle='dashed', linewidth=1)

plt.xlabel('Ending Capital')
plt.ylabel('Frequency')
plt.legend()
plt.title('Distribution of Ending Capital for Random and Model-Based Strategies')
plt.show()
| 185 | + |
# Variant of the trade simulation that keeps the whole capital path, not just the end value
def simulate_trades_over_time(trades, prices, starting_capital=None, notional=None):
    """Return every trader's capital after each step of a trade-label matrix.

    Labels are interpreted as 0 = Strong Sell, 1 = Sell, 2 = Hold, 3 = Buy,
    4 = Strong Buy. At step ``i`` (``i >= 1``) capital changes by
    ``exposure * notional * pct_change`` where ``pct_change`` is the relative
    price move from ``prices[i - 1]`` to ``prices[i]`` and ``exposure`` is
    -2, -1, 0, +1 or +2 for the five labels. Any other label value is treated
    as no trade, so the previous capital is carried forward unchanged.
    Column 0 always holds the starting capital because it has no previous price.

    Args:
        trades: 2-D array of labels, shape ``(n_traders, n_steps)``.
        prices: 1-D sequence of prices with at least ``n_steps`` entries.
        starting_capital: Capital each trader starts with. Defaults to the
            module-level ``initial_capital``.
        notional: Value committed per unit of exposure. Defaults to the
            module-level ``trade_amount``.

    Returns:
        float64 array of shape ``(n_traders, n_steps)``; entry ``[j, i]`` is
        trader ``j``'s capital after step ``i``.

    Raises:
        IndexError: If ``prices`` is shorter than the number of steps.
    """
    start = initial_capital if starting_capital is None else starting_capital
    size = trade_amount if notional is None else notional

    trades = np.asarray(trades)
    prices = np.asarray(prices, dtype=np.float64)
    n_traders, n_steps = trades.shape

    capital = np.full((n_traders, n_steps), start, dtype=np.float64)
    if n_steps < 2:
        # No step has a previous price, so the path is just the starting capital.
        return capital
    if prices.shape[0] < n_steps:
        raise IndexError(
            f"prices has {prices.shape[0]} entries but trades has {n_steps} columns"
        )

    # Relative price change for steps 1 .. n_steps - 1
    window = prices[:n_steps]
    pct_change = (window[1:] - window[:-1]) / window[:-1]

    # Map labels to signed exposure; Hold (2) and unknown labels stay at 0.
    decisions = trades[:, 1:]
    exposure = np.zeros(decisions.shape, dtype=np.float64)
    for label, multiplier in ((0, -2.0), (1, -1.0), (3, 1.0), (4, 2.0)):
        exposure[decisions == label] = multiplier

    # Mask flat positions explicitly so a non-finite price change cannot leak
    # into traders that did not trade on that step.
    pnl = np.where(exposure != 0, size * exposure * pct_change, 0.0)

    # Running total of per-step profit gives the capital path from step 1 onward.
    capital[:, 1:] = start + np.cumsum(pnl, axis=1)
    return capital
| 203 | + |
# Replay both strategies again, this time keeping the capital after every step
random_capital_over_time = simulate_trades_over_time(random_trades, prices)
model_capital_over_time = simulate_trades_over_time(model_trades, prices)

# Average across traders (axis 0) to get one capital curve per strategy
random_average_capital_over_time = np.mean(random_capital_over_time, axis=0)
model_average_capital_over_time = np.mean(model_capital_over_time, axis=0)

# Draw the two average curves on a single set of axes
_, capital_axes = plt.subplots(figsize=(12, 6))
capital_axes.plot(random_average_capital_over_time, label='Random Strategy', color='red', linestyle='--')
capital_axes.plot(model_average_capital_over_time, label='Model-based Strategy', color='blue')
capital_axes.set_title('Trading Simulation: Capital Over Time')
capital_axes.set_xlabel('Trade Number')
capital_axes.set_ylabel('Average Capital ($)')
capital_axes.legend()
capital_axes.grid(True)
plt.show()
0 commit comments