Commit d4d43c1: "many fixes"
1 parent: 6b02aa0

14 files changed: +1143 additions, -1011 deletions

CondaPkg.toml (1 addition, 0 deletions)

@@ -1,6 +1,7 @@
 [deps]
 scikit-learn = ""
 openml = ""
+tqdm = ""
 pandas = ""
 matplotlib = ""
 plotly = ""
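
The added tqdm entry makes the Python progress-bar package available to the course notebooks. A minimal sketch of how a CondaPkg.toml dependency is typically consumed from Julia, assuming the CondaPkg.jl/PythonCall.jl workflow (the loop body is a hypothetical placeholder):

using PythonCall                    # resolves the [deps] of CondaPkg.toml on first use
tqdm = pyimport("tqdm")             # the newly added conda package
for i in tqdm.tqdm(pyrange(1000))   # wrap an iterable with a progress bar
    # ... per-item work would go here ...
end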

Dockerfile (1 addition, 1 deletion)

@@ -25,4 +25,4 @@ COPY --chown=MLCourse notebooks/.cache notebooks/.cache
 RUN julia -e 'import Pkg; Pkg.activate(joinpath(Pkg.devdir(), "MLCourse")); Pkg.instantiate(); Pkg.activate(joinpath(Pkg.devdir(), "MLCourse", "RLEnv")); Pkg.instantiate();'
 
 # The "default command" for this docker thing.
-CMD ["julia", "--project=/home/MLCourse", "-e", "import PlutoSliderServer; PlutoSliderServer.run_git_directory(\".\"; Export_baked_notebookfile = false, SliderServer_port=8000, SliderServer_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], Export_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], SliderServer_host=\"0.0.0.0\", Export_slider_server_url=\"https://bio322.epfl.ch/\", Export_binder_url = \"https://mybinder.org/v2/gh/jbrea/MLCourse/binder\")"]
+CMD ["julia", "--project=/home/MLCourse", "-e", "import PlutoSliderServer; PlutoSliderServer.run_git_directory(\".\"; Export_baked_notebookfile = false, SliderServer_port=8000, SliderServer_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], Export_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], SliderServer_host=\"0.0.0.0\", Export_slider_server_url=\"https://bio322.epfl.ch/\")"]
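
The only change here drops the Export_binder_url keyword, so the exported pages no longer advertise a mybinder launch option (consistent with the README change below). For testing the slider server outside the container, a minimal sketch reusing only keywords that already appear in the CMD above:

import PlutoSliderServer
# serve the notebooks of the current git directory, as the container does
PlutoSliderServer.run_git_directory(".";
                                    SliderServer_port = 8000,
                                    SliderServer_host = "0.0.0.0")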

README.md (3 additions, 1 deletion)

@@ -1,7 +1,9 @@
 # MLCourse
 
 This repository contains teaching material for an introductory machine learning course.
-You can find an interactive preview of the Pluto notebooks of this course [here](https://bio322.epfl.ch) and you can run some notebooks on [mybinder](https://mybinder.org/v2/gh/jbrea/MLCourse/binder?urlpath=pluto/open?path%3D/home/jovyan/MLCourse/index.jl) (some notebooks will crash on mybinder when they hit the memory limit).
+
+**Students of my course do not need to pull this repository or follow the instructions
+below. This repository is mostly used to create the interactive websites on [https://bio322.epfl.ch](https://bio322.epfl.ch).**
 
 ## Installation

notebooks/clustering.jl (19 additions, 7 deletions)

@@ -93,7 +93,9 @@ using StatsPlots
 """
 ,
 """
+import matplotlib.pyplot as plt
 import seaborn as sns
+
 g = sns.PairGrid(iris.drop(["class"], axis =1))
 g.map_lower(sns.regplot, line_kws={'color': 'black'})
 g.map_diag(sns.histplot, color = 'darkorange' )
@@ -232,11 +234,15 @@ predict(hc, select(iris, Not(:class)))
 ,
 """
 from sklearn.cluster import AgglomerativeClustering
-hc = make_pipeline(StandardScaler(), AgglomerativeClustering(distance_threshold=0, n_clusters = None, metric = "euclidean", linkage = "complete")).fit(iris.drop(["class"], axis =1))
-hc
-hc
+hc = make_pipeline(StandardScaler(),
+                   AgglomerativeClustering(distance_threshold=0,
+                                           n_clusters = None,
+                                           metric = "euclidean",
+                                           linkage = "complete"))
+hc.fit(iris.drop(["class"], axis = 1))
 """
 ,
+py_showoutput = false,
 cache_jl_vars = [:hc]
 )
 
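The rewritten Python fits the pipeline in a separate step (the old version chained .fit inline and then evaluated hc twice), and the new py_showoutput = false suppresses the printed pipeline object. The Julia machine cached via cache_jl_vars = [:hc] is used as in the hunk header; a minimal sketch, assuming MLJ's HierarchicalClustering static model with a Symbol linkage keyword and the notebook's iris DataFrame:

using MLJ, DataFrames
# complete-linkage agglomerative clustering, analogous to the Python pipeline
hc = machine(HierarchicalClustering(linkage = :complete))
predict(hc, select(iris, Not(:class)))  # cluster labels for the four features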
@@ -323,16 +329,21 @@ md"""Seed of random number generator $(@bind seed Slider(collect(1:50), show_val
 
 Cluster assignment with $(@bind method Select(["DBSCAN", "k-means", "hierarchical clustering"]))"""
 
+# ╔═╡ 52f99e00-7493-4d56-8557-511e897223bb
+md"Here is an example of how DBSCAN can be used."
+
 # ╔═╡ 3c6c668f-d5a0-48f8-8f87-e448e71f4554
 mlcode("""
-nothing""",
+X = DataFrame(X1 = [1., 2, 2, 8, 8, 25], X2 = [2., 2, 3, 7, 8, 80])
+predict(machine(DBSCAN()), X)
+""",
 """
-# here is the python code to run DBSCAN
 from sklearn.cluster import DBSCAN
 import numpy as np
 X = np.array([[1, 2], [2, 2], [2, 3],
               [8, 7], [8, 8], [25, 80]])
 clustering = DBSCAN(eps=3, min_samples=2).fit(X)
+clustering.labels_
 """
 )
 
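On this toy data, scikit-learn's DBSCAN(eps=3, min_samples=2) groups the first three and the next two points into two clusters and marks the isolated point [25, 80] as noise (label -1 in clustering.labels_). A minimal Julia sketch with explicit hyperparameters; the radius and min_cluster_size keywords are assumptions based on the Clustering.jl-backed MLJ model, with radius playing the role of eps:

using MLJ, DataFrames
X = DataFrame(X1 = [1., 2, 2, 8, 8, 25], X2 = [2., 2, 3, 7, 8, 80])
# DBSCAN is a static model in MLJ: bind it without data, pass data to predict
mach = machine(DBSCAN(radius = 3, min_cluster_size = 2))
predict(mach, X)  # the isolated point should come back as noise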
@@ -555,7 +566,7 @@ md"to fit an unsupervised machine called `PCA` (with at most 2 output dimensions
 # ╟─d1c88a44-be52-4b0e-bc23-cca00d10ffb6
 # ╟─0e7f34b9-e24f-447b-840b-e2750d2e778b
 # ╟─70b3f1bb-7c47-4bb0-aa17-cda6fdbe0469
-# ╠═260a1fb7-58b8-4d83-b8d7-a0bd8e6836ac
+# ╟─260a1fb7-58b8-4d83-b8d7-a0bd8e6836ac
 # ╟─b5165fda-1bdd-4837-9eed-42ce9db40529
 # ╟─fdf190cc-0426-4327-a710-80fe2ead632c
 # ╟─ab161204-bbcd-4608-ae67-fcde39b2539b
@@ -569,7 +580,8 @@ md"to fit an unsupervised machine called `PCA` (with at most 2 output dimensions
 # ╟─6d845685-ac31-4df7-9d18-f1fab6c08e3d
 # ╟─9ca4cac1-f378-42cd-ba60-d174a47e23a8
 # ╟─8ea10eb7-8b37-4026-a7ec-e44bba7532ea
-# ╠═3c6c668f-d5a0-48f8-8f87-e448e71f4554
+# ╟─52f99e00-7493-4d56-8557-511e897223bb
+# ╟─3c6c668f-d5a0-48f8-8f87-e448e71f4554
 # ╟─9d54fbb8-44f8-46c8-90ef-de85746c410b
 # ╟─85d574c2-b823-4dcf-b711-efc755e724b7
 # ╟─1ed55c9f-1301-4553-84ee-26ada25f9b76

notebooks/flexibility.jl (0 additions, 2 deletions)

@@ -651,8 +651,6 @@ test_predictions = knn.predict(X_test)
 mnist_errorrate = np.mean(test_predictions != y_test)
 mnist_errorrate
 """
-;
-eval = false
 )
 
 # ╔═╡ 1c7ad8e3-dae7-4217-b528-bb0d3d1d5331
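
Removing "; eval = false" means this Python cell runs again when the notebooks are built. The error-rate computation is one line in either language; a minimal Julia sketch (the machine name knn_mach and the test arrays are hypothetical stand-ins for the notebook's variables):

using MLJ, Statistics
test_predictions = predict_mode(knn_mach, X_test)    # hard-label predictions
mnist_errorrate = mean(test_predictions .!= y_test)  # fraction misclassified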

notebooks/generalized_linear_regression.jl (21 additions, 10 deletions)

@@ -176,12 +176,12 @@ from sklearn.linear_model import LogisticRegression
 
 mach3 = LogisticRegression(penalty=None)
 mach3.fit(
-    classification_data['x'].values.reshape(-1, 1),
+    classification_data['x'].values.reshape(-1, 1),
     classification_data['y']
 )
 ("coeff : ", mach3.coef_ , "intercept", mach3.intercept_)
 """
-
+,
 )
 
 # ╔═╡ a9c7ca33-ce22-49b3-a976-8c180916fa5e
@@ -220,7 +220,7 @@ mlstring(md"
 If we want to extract the probability of a given response, we can use the `pdf` function."
 ,
 "
-The probability of a given response is store in one column of p. For the probability of response A :
+The probability of a given response is stored in one column of p. For the probability of response A:
 ")
 
 # ╔═╡ 5224d406-4e02-424d-9502-a22e0614cb96
@@ -279,6 +279,7 @@ def ll(theta):
 
 ll([-1.28858, 0.338548]) # the parameters we obtained above
 """
+,
 )
 
 # ╔═╡ b8b81c1b-0faf-4ce9-b690-2f6cc9542b0f
299300
classification_data["y"].values
300301
-log_loss(classification_data["y"].values, mach3.predict_proba(classification_data['x'].values.reshape(-1, 1)), normalize=False)
301302
"""
303+
,
302304
)
303305

304306
# ╔═╡ f7117513-283f-4e32-a2a1-3594c794c94d
@@ -412,8 +414,9 @@ from sklearn.feature_extraction.text import CountVectorizer
412414
413415
vectorizer = CountVectorizer()
414416
word_counts = vectorizer.fit_transform(spamdata["text"].values[:2000]).toarray()
415-
416417
"""
418+
,
419+
py_showoutput = false
417420
)
418421

419422
# ╔═╡ a37baeec-4252-40bd-8022-88cbedc504ed
@@ -476,8 +479,9 @@ predict(m3)
476479
"""
477480
m3 = LogisticRegression(penalty=None, max_iter = 1000)
478481
m3.fit(normalized_word_counts, spam_or_ham)
479-
480482
"""
483+
,
484+
py_showoutput = false
481485
)
482486

483487
# ╔═╡ 21b66582-3fda-401c-9421-73ae2f455a75
@@ -501,6 +505,7 @@ confusion_matrix(predict_mode(m3), spam_or_ham)
501505
from sklearn.metrics import confusion_matrix
502506
confusion_matrix(m3.predict(normalized_word_counts), spam_or_ham)
503507
"""
508+
,
504509
)
505510

506511
# ╔═╡ 4e4f4adf-364f-49b9-9391-5050a4c1286a
@@ -523,6 +528,7 @@ test_input = vectorizer.fit_transform(spamdata["text"].values[2000:]).toarray()
523528
test_labels = list(spamdata["label"][2000:])
524529
confusion_matrix(m3.predict(test_input), test_labels)
525530
"""
531+
,
526532
)
527533

528534
# ╔═╡ ef9489c3-2bff-431b-92c4-f1b9778040cf
@@ -570,6 +576,7 @@ training_auc = auc(fprs1, tprs1)
570576
test_auc = auc(fprs2, tprs2)
571577
(training_auc, test_auc)
572578
"""
579+
,
573580
)
574581

575582
# ╔═╡ a30578dd-aecb-46eb-b947-f009282cf2fc
@@ -600,6 +607,7 @@ def losses(machine, input, response):
600607
601608
losses(m3, normalized_word_counts, spam_or_ham)
602609
"""
610+
,
603611
)
604612

605613

@@ -612,6 +620,7 @@ losses(m3, test_input, test_labels)
612620
"""
613621
losses(m3, test_input, test_labels)
614622
"""
623+
,
615624
)
616625

617626
# ╔═╡ b6689b27-e8a2-44e4-8791-ce237767ee63
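
Most hunks in this stretch add only the comma that separates the Python code string from the keyword arguments of mlcode(...). The Julia sides, visible in the hunk headers, evaluate the spam classifier; a minimal sketch of that evaluation, assuming the fitted machine m3 and the notebook's normalized_word_counts and spam_or_ham:

using MLJ
yhat = predict_mode(m3, normalized_word_counts)  # hard-label predictions
confusion_matrix(yhat, spam_or_ham)              # training confusion matrix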
@@ -650,16 +659,16 @@ DataFrame(schema(bikesharing))
 ,
 """
 bikesharing.dropna(inplace=True) # remove rows with missing data
-bikesharing.dtypes.to_frame().reset_index()
 """
 ,
 cache = false
 )
 
 # ╔═╡ b9ba1df0-5086-4c0f-a2c9-200c2be27294
-md"Above we see that the `:count` column is detected as `Continuous`, whereas it should be `Count`. We will therefore coerce it to the correct scientific type in the first line of the cell below.
+mlstring(md"Above we see that the `:count` column is detected as `Continuous`, whereas it should be `Count`. We will therefore coerce it to the correct scientific type in the first line of the cell below.
 
-For count variables we can use Poisson regression. Following the standard recipe, we parametrize ``f(x) = \theta_0 + \theta_1 x_1 + \cdots +\theta_d x_d``, plug this into the formula of the Poisson distribution and fit the parameters ``\theta_0, \ldots, \theta_d`` by maximizing the log-likelihood. In `MLJ` this is done by the `CountRegressor()`."
+For count variables we can use Poisson regression. Following the standard recipe, we parametrize ``f(x) = \theta_0 + \theta_1 x_1 + \cdots +\theta_d x_d``, plug this into the formula of the Poisson distribution and fit the parameters ``\theta_0, \ldots, \theta_d`` by maximizing the log-likelihood. In `MLJ` this is done by the `CountRegressor()`.",
+md"")
 
 # ╔═╡ 81c55206-bf59-4c4e-ac5e-77a46e31bec7
 mlcode(
@@ -682,6 +691,7 @@ m4.fit(bikesharing[['temp', 'humidity']], bikesharing['count']) # Fitting the mo
 
 m4.coef_ # Retrieving the fitted parameters
 """
+,
 )
 
 # ╔═╡ 6ea40424-22a0-42b9-bfab-8d4903ab8d64
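
The Poisson model fitted here follows the recipe quoted above: with rate \lambda(x) = e^{f(x)}, the probability of a count y is (standard Poisson regression, stated for context)

P(y \mid x) = \frac{\lambda(x)^y e^{-\lambda(x)}}{y!}, \qquad \lambda(x) = e^{\theta_0 + \theta_1 x_1 + \cdots + \theta_d x_d},

and the parameters \theta_0, \ldots, \theta_d are chosen to maximize \sum_i \log P(y_i \mid x_i).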
@@ -698,6 +708,7 @@ predict(m4)
 """
 m4.predict(bikesharing[['temp', 'humidity']])
 """
+,
 )
 
 # ╔═╡ aa96bbe7-49f4-4244-9c71-8d9b2b3ee065
@@ -788,7 +799,7 @@ In the multiple linear regression of the weather data set above we used all
 md"""
 #### Exercise 5
 - Read the section on [scientific types in the MLJ manual](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/#Data-containers-and-scientific-types).
-- Coerce the `count` variable of the bike sharing data to `Continuous` and fit a linear model (`LinearRegressor`) with predictors `:temp` and `:humidity`.
+- Coerce the `count` variable of the bike sharing data to `Continuous` and fit a linear model (`LinearRegressor`) with predictors `:temp` and `:humidity`.
   Create a scatter plot with the true counts `bikesharing.count` on the x-axis and the predicted mode (`predict_mode`) of the counts for the linear regression model and the Poisson model on the y-axis. If the model perfectly captures the data, the plotted points should lie on the diagonal; you can add `plot!(identity)` to the figure to display the diagonal.
   Comment on the differences you see in the plot between the Poisson model and the linear regression model.
 """
@@ -917,7 +928,7 @@ begin
 end;
 
 # ╔═╡ 4f89ceab-297f-4c2c-9029-8d2d7fad084f
-let
+let
 	Random.seed!(17)
 	xgrid = -3:.25:3; ygrid = -3:.25:3
 	wireframe = [[PP.scatter3d(x = fill(x, length(ygrid)),

notebooks/gradient_descent.jl (2 additions, 0 deletions)

@@ -696,3 +696,5 @@ mlcode("rand(100)", nothing)
 # ╟─8459f86e-bce7-4839-9c51-57335ac6353c
 # ╟─7aa547f8-25d4-488d-9fc3-f633f7f03f57
 # ╟─9d250061-e570-4537-b1aa-f6a9019f343d
+# ╠═fac88373-4f6d-4661-9a1c-be173a725d4b
+# ╠═87aa211a-c383-4da2-b18a-f06567a4d3bf
