-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4_intro_to_scikit_and_ml.py
135 lines (96 loc) · 2.7 KB
/
4_intro_to_scikit_and_ml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# To add a new cell, type '#%%'
# To add a new markdown cell, type '#%% [markdown]'
#%%
from IPython import get_ipython
#%% [markdown]
#<h1> SIT 720 - Python Intro </h1>
#%% [markdown]
# #### Scikit-Learn Package for ML
#%%
import sys
# Install/upgrade scikit-learn via pip, invoked through sys.executable so the
# package is installed for the interpreter that is running this notebook.
# NOTE(review): the string is not an f-string; the {sys.executable}
# placeholder is presumably expanded by IPython's system() -- confirm.
get_ipython().system(u'{sys.executable} -m pip install -U scikit-learn')
#%%
from sklearn import datasets
#%%
# Load scikit-learn's digits dataset and print the whole returned object.
digits = datasets.load_digits()
print(digits)
#%%
# Print only the `target` attribute of the loaded dataset
# (presumably one class label per sample -- confirm against the sklearn docs).
print(digits.target)
#%% [markdown]
# Linear regression using SciKit-learn
#%%
# Fit a straight line to samples of exp(x) on [0, 1) and plot the model's
# predictions (red) on top of the training samples (blue).
#
# matplotlib is imported here because this cell plots: elsewhere in the file
# pyplot is only imported in the final word-cloud cell, so running the cells
# top to bottom would otherwise fail with NameError on `plt`.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

#Generate training data: 100 uniform samples in [0, 1) as a column vector,
#with exp(x) as the regression target
X = np.random.rand(100, 1)
Y = np.exp(X)
#Create linear model
linearModel = LinearRegression()
#Fit linear model to training data
linearModel.fit(X,Y)
#Generate test inputs; Y_test holds the model's predictions for them
#(not ground-truth values)
X_test = np.random.rand(1000,1)
Y_test = linearModel.predict(X_test)
#Red dots: fitted line evaluated at the test inputs; blue dots: training data
plt.plot(X_test,Y_test, ".r")
plt.plot(X,Y, ".b")
plt.show()
#%% [markdown]
# #### Text analysis with TF-IDF score
#%% [markdown]
# Creating the document
#%%
# Four short example sentences used as the documents for the text-analysis
# cells that follow.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
]
# Show the raw documents before any vectorisation is applied.
corpus_banner = "The corpus is the list: {}".format(corpus)
print(corpus_banner)
#%% [markdown]
# Using CountVectorizer in SciKit we can implement tokenisation and occurrence counting in a single class:
#
# Load module for sklearn:
#%%
from sklearn.feature_extraction.text import CountVectorizer
#%% [markdown]
# Initiate module:
#%%
# Build a CountVectorizer with default settings; the bare name on the next
# line makes the notebook display the configured estimator.
vectoriser = CountVectorizer()
vectoriser
#%% [markdown]
# Create the term freq matrix:
#%%
# Learn the vocabulary from the corpus and count term occurrences per document.
termFreq = vectoriser.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; since this notebook
# upgrades scikit-learn to the latest release, use its replacement
# get_feature_names_out() to list the learned terms (one per matrix column).
vectoriser.get_feature_names_out()
#%% [markdown]
# Now we can transform the term freq output into an array:
#%%
# Convert the fit_transform result to a regular array for display.
termFreq.toarray()
#%% [markdown]
# To do a TF-IDF transformation, we use TfidfVectorizer:
#%%
from sklearn.feature_extraction.text import TfidfVectorizer
# Build a TfidfVectorizer with default settings; the bare name on the next
# line makes the notebook display the configured estimator.
TFvector = TfidfVectorizer()
TFvector
#%%
#Apply to corpus: learn the vocabulary and compute the TF-IDF weights
tfVectorisation = TFvector.fit_transform(corpus)
tfVectorisation.toarray()
#%% [markdown]
# By default, the tf-idf vectorization returns a sparse matrix. We can see the output by converting it to a dense matrix with:
#%%
# Print the term -> column-index mapping of the TF-IDF vectoriser itself, so
# the mapping shown always matches the columns of the matrix displayed below
# (previously the unrelated CountVectorizer's vocabulary was printed here).
print(TFvector.vocabulary_)
tfVectorisation.todense()
#%%
import sys
# Install the wordcloud package via pip, invoked through sys.executable so it
# is installed for the interpreter that is running this notebook.
# NOTE(review): the string is not an f-string; the {sys.executable}
# placeholder is presumably expanded by IPython's system() -- confirm.
get_ipython().system(u'{sys.executable} -m pip install wordcloud')
#%%
from wordcloud import STOPWORDS, WordCloud
import matplotlib.pyplot as plt

# Sample text to visualise. The literal is split across lines only for
# readability; the resulting string is unchanged (including the missing
# space in "typicallyused", which is part of the original sample).
SOME_TEXT = (
    "A tag cloud is a visual representation for text data, typicallyused "
    "to depict keyword metadata on websites, or to visualize free form text. "
    "Some more text and tag."
)

# Cloud configuration: drop the package's stop words and render on a white
# 1200x1000 canvas.
cloud_settings = {
    "stopwords": STOPWORDS,
    "background_color": "white",
    "width": 1200,
    "height": 1000,
}
wordcloud = WordCloud(**cloud_settings).generate_from_text(SOME_TEXT)

# Print the `words_` attribute of the generated cloud, then render the cloud
# itself as an image.
print(wordcloud.words_)

fig = plt.figure()
plt.imshow(wordcloud)
plt.show()