Skip to content

Commit 45c676d

Browse files
committed
add new
1 parent c2c351c commit 45c676d

File tree

10 files changed

+1059
-0
lines changed

10 files changed

+1059
-0
lines changed

.idea/data_combat.iml

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/misc.xml

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/modules.xml

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/vcs.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/workspace.xml

Lines changed: 257 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Binary file not shown.
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# -*- coding: utf-8 -*-
2+
# Breast cancer diagnosis classification
3+
import pandas as pd
4+
import matplotlib.pyplot as plt
5+
import seaborn as sns
6+
from sklearn.model_selection import train_test_split
7+
from sklearn import svm
8+
from sklearn import metrics
9+
from sklearn.preprocessing import StandardScaler
10+
11+
# Load the dataset; data.csv is expected in the current working directory.
data = pd.read_csv("./data.csv")

# Data exploration.
# The dataset has many columns, so lift pandas' column display limit to get
# every column printed instead of an abbreviated view.
pd.set_option('display.max_columns', None)
# Print the column index, the first five rows and the summary statistics,
# one after another.
print(data.columns, data.head(5), data.describe(), sep="\n")
20+
21+
# Split the feature columns into three groups of ten, taken by position in
# the raw frame: columns 2-11, 12-21 and 22-31.
# NOTE(review): the names suggest mean / standard-error / worst measurements;
# confirm against the dataset description.
features_mean, features_se, features_worst = (
    list(data.columns[start:start + 10]) for start in (2, 12, 22)
)
25+
26+
# Data cleaning.
# The "id" column is of no use for classification, so remove it in place.
data.drop(columns=["id"], inplace=True)
# Encode the label numerically: B (benign) -> 0, M (malignant) -> 1.
data['diagnosis'] = data['diagnosis'].map(dict(M=1, B=0))
31+
32+
# Visualise the tumour diagnosis results as a count per class.
# The series is passed through the keyword `x`: current seaborn releases
# treat the first positional parameter as `data`, so a positionally passed
# series is no longer interpreted as the variable to count.
sns.countplot(x=data['diagnosis'], label="Count")
plt.show()
# Heat map of the pairwise correlations between the features_mean columns.
corr = data[features_mean].corr()
plt.figure(figsize=(14, 14))
# annot=True writes the correlation value inside every cell.
sns.heatmap(corr, annot=True)
plt.show()
41+
42+
43+
# Feature selection.
# A smaller hand-picked alternative would be: radius_mean, texture_mean,
# smoothness_mean, compactness_mean, symmetry_mean, fractal_dimension_mean.
# Here columns 1-30 of the cleaned frame are used.
# NOTE(review): assumes column 0 is the 'diagnosis' label once "id" has been
# dropped -- confirm against the CSV layout.
features_remain = data.columns[1:31]
print(features_remain)
print('-' * 100)
# Hold out 30% of the rows as the test set; the remainder is the training set.
# A fixed random_state makes the split, and therefore the reported accuracy,
# reproducible from run to run.
train, test = train_test_split(data, test_size=0.3, random_state=42)
# Extract the selected feature columns and the label for both partitions.
train_X = train[features_remain]
train_y = train['diagnosis']
test_X = test[features_remain]
test_y = test['diagnosis']
55+
56+
# Z-score normalisation so that every feature dimension has zero mean and
# unit variance. The scaler is fitted on the training data only and the same
# transformation is then applied to both partitions.
ss = StandardScaler().fit(train_X)
train_X, test_X = ss.transform(train_X), ss.transform(test_X)
60+
61+
# Create the SVM classifier (linear support vector classification).
model = svm.LinearSVC()
# Train on the training set.
model.fit(train_X, train_y)
# Predict the labels of the test set.
prediction = model.predict(test_X)
# accuracy_score is documented as (y_true, y_pred); pass the ground truth
# first so the call stays correct if an asymmetric metric is swapped in.
print('准确率: ', metrics.accuracy_score(test_y, prediction))

0 commit comments

Comments
 (0)