1
+ {
2
+ "cells" : [
3
+ {
4
+ "cell_type" : " markdown" ,
5
+ "metadata" : {
6
+ "id" : " tzcU5p2gdak9"
7
+ },
8
+ "source" : [
9
+ " # Introducing Email reader in SparkNLP\n " ,
10
+ " This notebook showcases the newly added `sparknlp.read().email()` method in Spark NLP, which parses email content from both local and distributed file systems into a Spark DataFrame."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type" : " code" ,
15
+ "source" : [
16
+ " from google.colab import drive\n " ,
17
+ " drive.mount('/content/drive')"
18
+ ],
19
+ "metadata" : {
20
+ "colab" : {
21
+ "base_uri" : " https://localhost:8080/"
22
+ },
23
+ "id" : " xrvHhiTAdfGd" ,
24
+ "outputId" : " 07fb7294-33b3-4af0-f4ac-d87e43fd21b6"
25
+ },
26
+ "execution_count" : 1 ,
27
+ "outputs" : [
28
+ {
29
+ "output_type" : " stream" ,
30
+ "name" : " stdout" ,
31
+ "text" : [
32
+ " Mounted at /content/drive\n "
33
+ ]
34
+ }
35
+ ]
36
+ },
37
+ {
38
+ "cell_type" : " code" ,
39
+ "source" : [
40
+ " !cp drive/MyDrive/JSL/sparknlp/sparknlp.jar .\n " ,
41
+ " !cp drive/MyDrive/JSL/sparknlp/spark_nlp-5.5.1-py2.py3-none-any.whl ."
42
+ ],
43
+ "metadata" : {
44
+ "id" : " mjV3NcQ8eA52"
45
+ },
46
+ "execution_count" : 8 ,
47
+ "outputs" : []
48
+ },
49
+ {
50
+ "cell_type" : " code" ,
51
+ "source" : [
52
+ " !pip install pyspark"
53
+ ],
54
+ "metadata" : {
55
+ "colab" : {
56
+ "base_uri" : " https://localhost:8080/"
57
+ },
58
+ "id" : " pEmutNjReCgc" ,
59
+ "outputId" : " 32610063-174f-432b-be4a-6ab2ae9dd709"
60
+ },
61
+ "execution_count" : 3 ,
62
+ "outputs" : [
63
+ {
64
+ "output_type" : " stream" ,
65
+ "name" : " stdout" ,
66
+ "text" : [
67
+ " Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.3)\n " ,
68
+ " Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n "
69
+ ]
70
+ }
71
+ ]
72
+ },
73
+ {
74
+ "cell_type" : " code" ,
75
+ "source" : [
76
+ " !pip install spark_nlp-5.5.1-py2.py3-none-any.whl"
77
+ ],
78
+ "metadata" : {
79
+ "colab" : {
80
+ "base_uri" : " https://localhost:8080/"
81
+ },
82
+ "id" : " 3qjPeDjvfCpA" ,
83
+ "outputId" : " 620c793f-5cb1-4a82-f687-53f3be348d9c"
84
+ },
85
+ "execution_count" : 9 ,
86
+ "outputs" : [
87
+ {
88
+ "output_type" : " stream" ,
89
+ "name" : " stdout" ,
90
+ "text" : [
91
+ " Processing ./spark_nlp-5.5.1-py2.py3-none-any.whl\n " ,
92
+ " Installing collected packages: spark-nlp\n " ,
93
+ " Successfully installed spark-nlp-5.5.1\n "
94
+ ]
95
+ }
96
+ ]
97
+ },
98
+ {
99
+ "cell_type" : " code" ,
100
+ "source" : [
101
+ " # import sparknlp\n " ,
102
+ " # # let's start Spark with Spark NLP\n " ,
103
+ " # spark = sparknlp.start()\n " ,
104
+ " \n " ,
105
+ " from pyspark.sql import SparkSession\n " ,
106
+ " \n " ,
107
+ " spark = SparkSession.builder \\\n " ,
108
+ " .appName(\" SparkNLP\" ) \\\n " ,
109
+ " .master(\" local[*]\" ) \\\n " ,
110
+ " .config(\" spark.driver.memory\" , \" 12G\" ) \\\n " ,
111
+ " .config(\" spark.serializer\" , \" org.apache.spark.serializer.KryoSerializer\" ) \\\n " ,
112
+ " .config(\" spark.kryoserializer.buffer.max\" , \" 2000M\" ) \\\n " ,
113
+ " .config(\" spark.driver.maxResultSize\" , \" 0\" ) \\\n " ,
114
+ " .config(\" spark.jars\" , \" ./sparknlp.jar\" ) \\\n " ,
115
+ " .getOrCreate()\n " ,
116
+ " \n " ,
117
+ " \n " ,
118
+ " print(\" Apache Spark version: {}\" .format(spark.version))"
119
+ ],
120
+ "metadata" : {
121
+ "colab" : {
122
+ "base_uri" : " https://localhost:8080/"
123
+ },
124
+ "id" : " DczWop6QeE8F" ,
125
+ "outputId" : " 714b032f-e076-4aa3-8cf2-10eea6993c4d"
126
+ },
127
+ "execution_count" : 5 ,
128
+ "outputs" : [
129
+ {
130
+ "output_type" : " stream" ,
131
+ "name" : " stdout" ,
132
+ "text" : [
133
+ " Apache Spark version: 3.5.3\n "
134
+ ]
135
+ }
136
+ ]
137
+ },
138
+ {
139
+ "cell_type" : " markdown" ,
140
+ "metadata" : {
141
+ "id" : " RFOFhaEedalB"
142
+ },
143
+ "source" : [
144
+ " ## Setup and Initialization\n " ,
145
+ " Let's keep in mind a few things before we start 😊\n " ,
146
+ " \n " ,
147
+ " Support for reading email files was introduced in Spark NLP 5.5.2. Please make sure you have upgraded to the latest Spark NLP release. (Note: this demo runs against a pre-release build, which is why a 5.5.1 wheel and a local `sparknlp.jar` are installed above.)\n " ,
148
+ " \n " ,
149
+ " For local files example we will download a couple of email files from Spark NLP Github repo:"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type" : " code" ,
154
+ "execution_count" : 17 ,
155
+ "metadata" : {
156
+ "colab" : {
157
+ "base_uri" : " https://localhost:8080/"
158
+ },
159
+ "id" : " ya8qZe00dalC" ,
160
+ "outputId" : " a9916407-f76d-4c59-fdad-ea17ca0a4326"
161
+ },
162
+ "outputs" : [
163
+ {
164
+ "output_type" : " stream" ,
165
+ "name" : " stdout" ,
166
+ "text" : [
167
+ " mkdir: cannot create directory ‘email-files’: File exists\n " ,
168
+ " --2024-11-13 21:01:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/email-text-attachments.eml\n " ,
169
+ " Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n " ,
170
+ " Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n " ,
171
+ " HTTP request sent, awaiting response... 200 OK\n " ,
172
+ " Length: 3175 (3.1K) [text/plain]\n " ,
173
+ " Saving to: ‘email-files/email-text-attachments.eml’\n " ,
174
+ " \n " ,
175
+ " email-text-attachme 100%[===================>] 3.10K --.-KB/s in 0s \n " ,
176
+ " \n " ,
177
+ " 2024-11-13 21:01:15 (29.9 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n " ,
178
+ " \n " ,
179
+ " --2024-11-13 21:01:15-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/test-several-attachments.eml\n " ,
180
+ " Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n " ,
181
+ " Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n " ,
182
+ " HTTP request sent, awaiting response... 200 OK\n " ,
183
+ " Length: 1324361 (1.3M) [text/plain]\n " ,
184
+ " Saving to: ‘email-files/test-several-attachments.eml’\n " ,
185
+ " \n " ,
186
+ " test-several-attach 100%[===================>] 1.26M --.-KB/s in 0.05s \n " ,
187
+ " \n " ,
188
+ " 2024-11-13 21:01:16 (26.7 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n " ,
189
+ " \n "
190
+ ]
191
+ }
192
+ ],
193
+ "source" : [
194
+ " !mkdir email-files\n " ,
195
+ " !wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/email-text-attachments.eml -P email-files\n " ,
196
+ " !wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/test-several-attachments.eml -P email-files"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type" : " code" ,
201
+ "source" : [
202
+ " !ls -lh ./email-files"
203
+ ],
204
+ "metadata" : {
205
+ "colab" : {
206
+ "base_uri" : " https://localhost:8080/"
207
+ },
208
+ "id" : " 3xgGItNbU2DZ" ,
209
+ "outputId" : " 12f8a7be-f9b4-49ce-a9ab-222142f28293"
210
+ },
211
+ "execution_count" : 18 ,
212
+ "outputs" : [
213
+ {
214
+ "output_type" : " stream" ,
215
+ "name" : " stdout" ,
216
+ "text" : [
217
+ " total 1.3M\n " ,
218
+ " -rw-r--r-- 1 root root 3.2K Nov 13 21:01 email-text-attachments.eml\n " ,
219
+ " -rw-r--r-- 1 root root 1.3M Nov 13 21:01 test-several-attachments.eml\n "
220
+ ]
221
+ }
222
+ ]
223
+ },
224
+ {
225
+ "cell_type" : " markdown" ,
226
+ "metadata" : {
227
+ "id" : " EoFI66NAdalE"
228
+ },
229
+ "source" : [
230
+ " ## Parsing Email from Local Files\n " ,
231
+ " Use the `email()` method to parse email content from local directories."
232
+ ]
233
+ },
234
+ {
235
+ "cell_type" : " code" ,
236
+ "execution_count" : 22 ,
237
+ "metadata" : {
238
+ "colab" : {
239
+ "base_uri" : " https://localhost:8080/"
240
+ },
241
+ "id" : " bAkMjJ1vdalE" ,
242
+ "outputId" : " 4b360b6c-5049-4f10-bb52-60e0e0e52e52"
243
+ },
244
+ "outputs" : [
245
+ {
246
+ "output_type" : " stream" ,
247
+ "name" : " stdout" ,
248
+ "text" : [
249
+ " Warning::Spark Session already created, some configs may not take.\n " ,
250
+ " +--------------------+\n " ,
251
+ " | email|\n " ,
252
+ " +--------------------+\n " ,
253
+ " |[{Title, Email Te...|\n " ,
254
+ " |[{Title, Test Sev...|\n " ,
255
+ " +--------------------+\n " ,
256
+ " \n "
257
+ ]
258
+ }
259
+ ],
260
+ "source" : [
261
+ " import sparknlp\n " ,
262
+ " email_df = sparknlp.read().email(\" ./email-files\" )\n " ,
263
+ " \n " ,
264
+ " email_df.select(\" email\" ).show()"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type" : " code" ,
269
+ "source" : [
270
+ " email_df.printSchema()"
271
+ ],
272
+ "metadata" : {
273
+ "colab" : {
274
+ "base_uri" : " https://localhost:8080/"
275
+ },
276
+ "id" : " 7CMPPubFTeHj" ,
277
+ "outputId" : " 48ee68cf-0f7f-408a-a855-2fd2eb2e8bd1"
278
+ },
279
+ "execution_count" : 21 ,
280
+ "outputs" : [
281
+ {
282
+ "output_type" : " stream" ,
283
+ "name" : " stdout" ,
284
+ "text" : [
285
+ " root\n " ,
286
+ " |-- path: string (nullable = true)\n " ,
287
+ " |-- content: binary (nullable = true)\n " ,
288
+ " |-- email: array (nullable = true)\n " ,
289
+ " | |-- element: struct (containsNull = true)\n " ,
290
+ " | | |-- elementType: string (nullable = true)\n " ,
291
+ " | | |-- content: string (nullable = true)\n " ,
292
+ " | | |-- metadata: map (nullable = true)\n " ,
293
+ " | | | |-- key: string\n " ,
294
+ " | | | |-- value: string (valueContainsNull = true)\n " ,
295
+ " \n "
296
+ ]
297
+ }
298
+ ]
299
+ },
300
+ {
301
+ "cell_type" : " markdown" ,
302
+ "source" : [
303
+ " You can also use DFS like Databricks `dbfs://` or HDFS directories `hdfs://`"
304
+ ],
305
+ "metadata" : {
306
+ "id" : " Qooecm9VTeus"
307
+ }
308
+ }
309
+ ],
310
+ "metadata" : {
311
+ "kernelspec" : {
312
+ "display_name" : " Python 3 (ipykernel)" ,
313
+ "language" : " python" ,
314
+ "name" : " python3"
315
+ },
316
+ "language_info" : {
317
+ "codemirror_mode" : {
318
+ "name" : " ipython" ,
319
+ "version" : 3
320
+ },
321
+ "file_extension" : " .py" ,
322
+ "mimetype" : " text/x-python" ,
323
+ "name" : " python" ,
324
+ "nbconvert_exporter" : " python" ,
325
+ "pygments_lexer" : " ipython3" ,
326
+ "version" : " 3.10.12"
327
+ },
328
+ "colab" : {
329
+ "provenance" : []
330
+ }
331
+ },
332
+ "nbformat" : 4 ,
333
+ "nbformat_minor" : 0
334
+ }