Commit d4d43c1: "many fixes"
1 parent: 6b02aa0

14 files changed: +1143 additions, -1011 deletions

CondaPkg.toml (1 addition, 0 deletions)

@@ -1,6 +1,7 @@
 [deps]
 scikit-learn = ""
 openml = ""
+tqdm = ""
 pandas = ""
 matplotlib = ""
 plotly = ""
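
The added tqdm entry makes the Python progress-bar package available to the course notebooks. A minimal sketch of how a CondaPkg.toml dependency is typically consumed from Julia, assuming the CondaPkg.jl/PythonCall.jl workflow (the loop body is a hypothetical placeholder):

using PythonCall                    # resolves the [deps] of CondaPkg.toml on first use
tqdm = pyimport("tqdm")             # the newly added conda package
for i in tqdm.tqdm(pyrange(1000))   # wrap an iterable with a progress bar
    # ... per-item work would go here ...
end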

Dockerfile (1 addition, 1 deletion)

@@ -25,4 +25,4 @@ COPY --chown=MLCourse notebooks/.cache notebooks/.cache
 RUN julia -e 'import Pkg; Pkg.activate(joinpath(Pkg.devdir(), "MLCourse")); Pkg.instantiate(); Pkg.activate(joinpath(Pkg.devdir(), "MLCourse", "RLEnv")); Pkg.instantiate();'
 
 # The "default command" for this docker thing.
-CMD ["julia", "--project=/home/MLCourse", "-e", "import PlutoSliderServer; PlutoSliderServer.run_git_directory(\".\"; Export_baked_notebookfile = false, SliderServer_port=8000, SliderServer_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], Export_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], SliderServer_host=\"0.0.0.0\", Export_slider_server_url=\"https://bio322.epfl.ch/\", Export_binder_url = \"https://mybinder.org/v2/gh/jbrea/MLCourse/binder\")"]
+CMD ["julia", "--project=/home/MLCourse", "-e", "import PlutoSliderServer; PlutoSliderServer.run_git_directory(\".\"; Export_baked_notebookfile = false, SliderServer_port=8000, SliderServer_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], Export_exclude = [\"extras/transfer_learning.jl\", \"extras/generative_models.jl\"], SliderServer_host=\"0.0.0.0\", Export_slider_server_url=\"https://bio322.epfl.ch/\")"]
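
The only change here drops the Export_binder_url keyword, so the exported pages no longer advertise a mybinder launch option (consistent with the README change below). For testing the slider server outside the container, a minimal sketch reusing only keywords that already appear in the CMD above:

import PlutoSliderServer
# serve the notebooks of the current git directory, as the container does
PlutoSliderServer.run_git_directory(".";
                                    SliderServer_port = 8000,
                                    SliderServer_host = "0.0.0.0")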

README.md (3 additions, 1 deletion)

@@ -1,7 +1,9 @@
 # MLCourse
 
 This repository contains teaching material for an introductory machine learning course.
-You can find an interactive preview of the Pluto notebooks of this course [here](https://bio322.epfl.ch) and you can run some notebooks on [mybinder](https://mybinder.org/v2/gh/jbrea/MLCourse/binder?urlpath=pluto/open?path%3D/home/jovyan/MLCourse/index.jl) (some notebooks will crash on mybinder when they hit the memory limit).
+
+**Students of my course do not need to pull this repository or follow the instructions
+below. This repository is mostly used to create the interactive websites on [https://bio322.epfl.ch](https://bio322.epfl.ch).**
 
 ## Installation

notebooks/clustering.jl (19 additions, 7 deletions)

@@ -93,7 +93,9 @@ using StatsPlots
 """
 ,
 """
+import matplotlib.pyplot as plt
 import seaborn as sns
+
 g = sns.PairGrid(iris.drop(["class"], axis =1))
 g.map_lower(sns.regplot, line_kws={'color': 'black'})
 g.map_diag(sns.histplot, color = 'darkorange' )
@@ -232,11 +234,15 @@ predict(hc, select(iris, Not(:class)))
 ,
 """
 from sklearn.cluster import AgglomerativeClustering
-hc = make_pipeline(StandardScaler(), AgglomerativeClustering(distance_threshold=0, n_clusters = None, metric = "euclidean", linkage = "complete")).fit(iris.drop(["class"], axis =1))
-hc
-hc
+hc = make_pipeline(StandardScaler(),
+                   AgglomerativeClustering(distance_threshold=0,
+                                           n_clusters = None,
+                                           metric = "euclidean",
+                                           linkage = "complete"))
+hc.fit(iris.drop(["class"], axis = 1))
 """
 ,
+py_showoutput = false,
 cache_jl_vars = [:hc]
 )
 
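The rewritten Python fits the pipeline in a separate step (the old version chained .fit inline and then evaluated hc twice), and the new py_showoutput = false suppresses the printed pipeline object. The Julia machine cached via cache_jl_vars = [:hc] is used as in the hunk header; a minimal sketch, assuming MLJ's HierarchicalClustering static model with a Symbol linkage keyword and the notebook's iris DataFrame:

using MLJ, DataFrames
# complete-linkage agglomerative clustering, analogous to the Python pipeline
hc = machine(HierarchicalClustering(linkage = :complete))
predict(hc, select(iris, Not(:class)))  # cluster labels for the four features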
@@ -323,16 +329,21 @@ md"""Seed of random number generator $(@bind seed Slider(collect(1:50), show_val
 
 Cluster assignment with $(@bind method Select(["DBSCAN", "k-means", "hierarchical clustering"]))"""
 
+# ╔═╡ 52f99e00-7493-4d56-8557-511e897223bb
+md"Here is an example of how DBSCAN can be used."
+
 # ╔═╡ 3c6c668f-d5a0-48f8-8f87-e448e71f4554
 mlcode("""
-nothing""",
+X = DataFrame(X1 = [1., 2, 2, 8, 8, 25], X2 = [2., 2, 3, 7, 8, 80])
+predict(machine(DBSCAN()), X)
+""",
 """
-# here is the python code to run DBSCAN
 from sklearn.cluster import DBSCAN
 import numpy as np
 X = np.array([[1, 2], [2, 2], [2, 3],
               [8, 7], [8, 8], [25, 80]])
 clustering = DBSCAN(eps=3, min_samples=2).fit(X)
+clustering.labels_
 """
 )
 
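On this toy data, scikit-learn's DBSCAN(eps=3, min_samples=2) groups the first three and the next two points into two clusters and marks the isolated point [25, 80] as noise (label -1 in clustering.labels_). A minimal Julia sketch with explicit hyperparameters; the radius and min_cluster_size keywords are assumptions based on the Clustering.jl-backed MLJ model, with radius playing the role of eps:

using MLJ, DataFrames
X = DataFrame(X1 = [1., 2, 2, 8, 8, 25], X2 = [2., 2, 3, 7, 8, 80])
# DBSCAN is a static model in MLJ: bind it without data, pass data to predict
mach = machine(DBSCAN(radius = 3, min_cluster_size = 2))
predict(mach, X)  # the isolated point should come back as noise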
@@ -555,7 +566,7 @@ md"to fit an unsupervised machine called `PCA` (with at most 2 output dimensions
 # ╟─d1c88a44-be52-4b0e-bc23-cca00d10ffb6
 # ╟─0e7f34b9-e24f-447b-840b-e2750d2e778b
 # ╟─70b3f1bb-7c47-4bb0-aa17-cda6fdbe0469
-# ╠═260a1fb7-58b8-4d83-b8d7-a0bd8e6836ac
+# ╟─260a1fb7-58b8-4d83-b8d7-a0bd8e6836ac
 # ╟─b5165fda-1bdd-4837-9eed-42ce9db40529
 # ╟─fdf190cc-0426-4327-a710-80fe2ead632c
 # ╟─ab161204-bbcd-4608-ae67-fcde39b2539b
@@ -569,7 +580,8 @@ md"to fit an unsupervised machine called `PCA` (with at most 2 output dimensions
 # ╟─6d845685-ac31-4df7-9d18-f1fab6c08e3d
 # ╟─9ca4cac1-f378-42cd-ba60-d174a47e23a8
 # ╟─8ea10eb7-8b37-4026-a7ec-e44bba7532ea
-# ╠═3c6c668f-d5a0-48f8-8f87-e448e71f4554
+# ╟─52f99e00-7493-4d56-8557-511e897223bb
+# ╟─3c6c668f-d5a0-48f8-8f87-e448e71f4554
 # ╟─9d54fbb8-44f8-46c8-90ef-de85746c410b
 # ╟─85d574c2-b823-4dcf-b711-efc755e724b7
 # ╟─1ed55c9f-1301-4553-84ee-26ada25f9b76

notebooks/flexibility.jl (0 additions, 2 deletions)

@@ -651,8 +651,6 @@ test_predictions = knn.predict(X_test)
 mnist_errorrate = np.mean(test_predictions != y_test)
 mnist_errorrate
 """
-;
-eval = false
 )
 
 # ╔═╡ 1c7ad8e3-dae7-4217-b528-bb0d3d1d5331
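
Removing "; eval = false" means this Python cell runs again when the notebooks are built. The error-rate computation is one line in either language; a minimal Julia sketch (the machine name knn_mach and the test arrays are hypothetical stand-ins for the notebook's variables):

using MLJ, Statistics
test_predictions = predict_mode(knn_mach, X_test)    # hard-label predictions
mnist_errorrate = mean(test_predictions .!= y_test)  # fraction misclassified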

notebooks/generalized_linear_regression.jl (21 additions, 10 deletions)

@@ -176,12 +176,12 @@ from sklearn.linear_model import LogisticRegression
 
 mach3 = LogisticRegression(penalty=None)
 mach3.fit(
-    classification_data['x'].values.reshape(-1, 1),
+    classification_data['x'].values.reshape(-1, 1),
     classification_data['y']
 )
 ("coeff : ", mach3.coef_ , "intercept", mach3.intercept_)
 """
-
+,
 )
 
 # ╔═╡ a9c7ca33-ce22-49b3-a976-8c180916fa5e
@@ -220,7 +220,7 @@ mlstring(md"
 If we want to extract the probability of a given response, we can use the `pdf` function."
 ,
 "
-The probability of a given response is store in one column of p. For the probability of response A :
+The probability of a given response is stored in one column of p. For the probability of response A:
 ")
 
 # ╔═╡ 5224d406-4e02-424d-9502-a22e0614cb96
@@ -279,6 +279,7 @@ def ll(theta):
 
 ll([-1.28858, 0.338548]) # the parameters we obtained above
 """
+,
 )
 
 # ╔═╡ b8b81c1b-0faf-4ce9-b690-2f6cc9542b0f
299300
classification_data["y"].values
300301
-log_loss(classification_data["y"].values, mach3.predict_proba(classification_data['x'].values.reshape(-1, 1)), normalize=False)
301302
"""
303+
,
302304
)
303305

304306
# ╔═╡ f7117513-283f-4e32-a2a1-3594c794c94d
@@ -412,8 +414,9 @@ from sklearn.feature_extraction.text import CountVectorizer
412414
413415
vectorizer = CountVectorizer()
414416
word_counts = vectorizer.fit_transform(spamdata["text"].values[:2000]).toarray()
415-
416417
"""
418+
,
419+
py_showoutput = false
417420
)
418421

419422
# ╔═╡ a37baeec-4252-40bd-8022-88cbedc504ed
@@ -476,8 +479,9 @@ predict(m3)
476479
"""
477480
m3 = LogisticRegression(penalty=None, max_iter = 1000)
478481
m3.fit(normalized_word_counts, spam_or_ham)
479-
480482
"""
483+
,
484+
py_showoutput = false
481485
)
482486

483487
# ╔═╡ 21b66582-3fda-401c-9421-73ae2f455a75
@@ -501,6 +505,7 @@ confusion_matrix(predict_mode(m3), spam_or_ham)
501505
from sklearn.metrics import confusion_matrix
502506
confusion_matrix(m3.predict(normalized_word_counts), spam_or_ham)
503507
"""
508+
,
504509
)
505510

506511
# ╔═╡ 4e4f4adf-364f-49b9-9391-5050a4c1286a
@@ -523,6 +528,7 @@ test_input = vectorizer.fit_transform(spamdata["text"].values[2000:]).toarray()
523528
test_labels = list(spamdata["label"][2000:])
524529
confusion_matrix(m3.predict(test_input), test_labels)
525530
"""
531+
,
526532
)
527533

528534
# ╔═╡ ef9489c3-2bff-431b-92c4-f1b9778040cf
@@ -570,6 +576,7 @@ training_auc = auc(fprs1, tprs1)
570576
test_auc = auc(fprs2, tprs2)
571577
(training_auc, test_auc)
572578
"""
579+
,
573580
)
574581

575582
# ╔═╡ a30578dd-aecb-46eb-b947-f009282cf2fc
@@ -600,6 +607,7 @@ def losses(machine, input, response):
600607
601608
losses(m3, normalized_word_counts, spam_or_ham)
602609
"""
610+
,
603611
)
604612

605613

@@ -612,6 +620,7 @@ losses(m3, test_input, test_labels)
612620
"""
613621
losses(m3, test_input, test_labels)
614622
"""
623+
,
615624
)
616625

617626
# ╔═╡ b6689b27-e8a2-44e4-8791-ce237767ee63
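
Most hunks in this stretch add only the comma that separates the Python code string from the keyword arguments of mlcode(...). The Julia sides, visible in the hunk headers, evaluate the spam classifier; a minimal sketch of that evaluation, assuming the fitted machine m3 and the notebook's normalized_word_counts and spam_or_ham:

using MLJ
yhat = predict_mode(m3, normalized_word_counts)  # hard-label predictions
confusion_matrix(yhat, spam_or_ham)              # training confusion matrix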
@@ -650,16 +659,16 @@ DataFrame(schema(bikesharing))
 ,
 """
 bikesharing.dropna(inplace=True) # remove rows with missing data
-bikesharing.dtypes.to_frame().reset_index()
 """
 ,
 cache = false
 )
 
 # ╔═╡ b9ba1df0-5086-4c0f-a2c9-200c2be27294
-md"Above we see that the `:count` column is detected as `Continuous`, whereas it should be `Count`. We will therefore coerce it to the correct scientific type in the first line of the cell below.
+mlstring(md"Above we see that the `:count` column is detected as `Continuous`, whereas it should be `Count`. We will therefore coerce it to the correct scientific type in the first line of the cell below.
 
-For count variables we can use Poisson regression. Following the standard recipe, we parametrize ``f(x) = \theta_0 + \theta_1 x_1 + \cdots +\theta_d x_d``, plug this into the formula of the Poisson distribution and fit the parameters ``\theta_0, \ldots, \theta_d`` by maximizing the log-likelihood. In `MLJ` this is done by the `CountRegressor()`."
+For count variables we can use Poisson regression. Following the standard recipe, we parametrize ``f(x) = \theta_0 + \theta_1 x_1 + \cdots +\theta_d x_d``, plug this into the formula of the Poisson distribution and fit the parameters ``\theta_0, \ldots, \theta_d`` by maximizing the log-likelihood. In `MLJ` this is done by the `CountRegressor()`.",
+md"")
 
 # ╔═╡ 81c55206-bf59-4c4e-ac5e-77a46e31bec7
 mlcode(
@@ -682,6 +691,7 @@ m4.fit(bikesharing[['temp', 'humidity']], bikesharing['count']) # Fitting the mo
 
 m4.coef_ # Retrieving the fitted parameters
 """
+,
 )
 
 # ╔═╡ 6ea40424-22a0-42b9-bfab-8d4903ab8d64
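
The Poisson model fitted here follows the recipe quoted above: with rate \lambda(x) = e^{f(x)}, the probability of a count y is (standard Poisson regression, stated for context)

P(y \mid x) = \frac{\lambda(x)^y e^{-\lambda(x)}}{y!}, \qquad \lambda(x) = e^{\theta_0 + \theta_1 x_1 + \cdots + \theta_d x_d},

and the parameters \theta_0, \ldots, \theta_d are chosen to maximize \sum_i \log P(y_i \mid x_i).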
@@ -698,6 +708,7 @@ predict(m4)
 """
 m4.predict(bikesharing[['temp', 'humidity']])
 """
+,
 )
 
 # ╔═╡ aa96bbe7-49f4-4244-9c71-8d9b2b3ee065
@@ -788,7 +799,7 @@ In the multiple linear regression of the weather data set above we used all
 md"""
 #### Exercise 5
 - Read the section on [scientific types in the MLJ manual](https://alan-turing-institute.github.io/MLJ.jl/dev/getting_started/#Data-containers-and-scientific-types).
-- Coerce the `count` variable of the bike sharing data to `Continuous` and fit a linear model (`LinearRegressor`) with predictors `:temp` and `:humidity`.
+- Coerce the `count` variable of the bike sharing data to `Continuous` and fit a linear model (`LinearRegressor`) with predictors `:temp` and `:humidity`.
   Create a scatter plot with the true counts `bikesharing.count` on the x-axis and the predicted mode (`predict_mode`) of the counts for the linear regression model and the Poisson model on the y-axis. If the model perfectly captures the data, the plotted points should lie on the diagonal; you can add `plot!(identity)` to the figure to display the diagonal.
   Comment on the differences you see in the plot between the Poisson model and the linear regression model.
 """
@@ -917,7 +928,7 @@ begin
 end;
 
 # ╔═╡ 4f89ceab-297f-4c2c-9029-8d2d7fad084f
-let
+let
 	Random.seed!(17)
 	xgrid = -3:.25:3; ygrid = -3:.25:3
 	wireframe = [[PP.scatter3d(x = fill(x, length(ygrid)),

notebooks/gradient_descent.jl (2 additions, 0 deletions)

@@ -696,3 +696,5 @@ mlcode("rand(100)", nothing)
 # ╟─8459f86e-bce7-4839-9c51-57335ac6353c
 # ╟─7aa547f8-25d4-488d-9fc3-f633f7f03f57
 # ╟─9d250061-e570-4537-b1aa-f6a9019f343d
+# ╠═fac88373-4f6d-4661-9a1c-be173a725d4b
+# ╠═87aa211a-c383-4da2-b18a-f06567a4d3bf
