|
"""Exploratory analysis and a baseline linear regression on the diamonds data.

Reads ``diamonds.csv`` (path is relative to the current working directory),
prints summary output, shows a carat-vs-price scatter plot and a correlation
heatmap of the numeric columns, then fits a linear regression of ``price`` on
six numeric features and prints the test-set MSE and R-squared.
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

diamonds = pd.read_csv('diamonds.csv')

# Quick look at the raw data before any cleaning.
print(diamonds.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
diamonds.info()
print(diamonds.describe())

# Drop every row that has a missing value in any column.
diamonds = diamonds.dropna()

sns.scatterplot(x='carat', y='price', data=diamonds)
plt.title('Carat vs Price')
plt.show()

# Select only numeric columns for correlation analysis.
# NOTE(review): if the CSV carries an exported index column, it will be picked
# up here as a numeric column too -- confirm against the actual file.
numeric_columns = diamonds.select_dtypes(include=['number']).columns
correlation_matrix = diamonds[numeric_columns].corr()

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Predictors are limited to these six columns; non-numeric columns are
# excluded rather than encoded.
X = diamonds[['carat', 'depth', 'table', 'x', 'y', 'z']]
y = diamonds['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows only.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
0 commit comments