Commit 1429482

[SPARKNLP-1093] Adding support to read Email files (#14455)
* [SPARKNLP-1089] Adding support to read HTML files
* [SPARKNLP-1089] Adding documentation and support for set of URLs in python
* [SPARKNLP-1089] Adding input validation in python
* [SPARKNLP-1093] Adding support to read email files
* [SPARKNLP-1093] Adding reading email notebook example
1 parent dfae801 · commit 1429482

23 files changed: +33559 -5 lines changed
build.sbt

Lines changed: 5 additions & 1 deletion

@@ -156,7 +156,10 @@ lazy val utilDependencies = Seq(
       exclude ("com.fasterxml.jackson.dataformat", "jackson-dataformat-cbor"),
     greex,
     azureIdentity,
-    azureStorage)
+    azureStorage,
+    jsoup,
+    jakartaMail
+  )
 
 lazy val typedDependencyParserDependencies = Seq(junit)
 
@@ -229,6 +232,7 @@ lazy val root = (project in file("."))
 
   (assembly / assemblyMergeStrategy) := {
     case PathList("META-INF", "versions", "9", "module-info.class") => MergeStrategy.discard
+    case PathList("module-info.class") => MergeStrategy.discard // Discard any module-info.class globally
     case PathList("apache.commons.lang3", _ @_*) => MergeStrategy.discard
     case PathList("org.apache.hadoop", _ @_*) => MergeStrategy.first
     case PathList("com.amazonaws", _ @_*) => MergeStrategy.last
Email reader example notebook (new file)

Lines changed: 334 additions & 0 deletions

@@ -0,0 +1,334 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "tzcU5p2gdak9"
   },
   "source": [
    "# Introducing Email reader in SparkNLP\n",
    "This notebook showcases the newly added `sparknlp.read().email()` method in Spark NLP that parses email content from both local file system and distributed file systems into a Spark DataFrame."
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "from google.colab import drive\n",
    "drive.mount('/content/drive')"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "xrvHhiTAdfGd",
    "outputId": "07fb7294-33b3-4af0-f4ac-d87e43fd21b6"
   },
   "execution_count": 1,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Mounted at /content/drive\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "!cp drive/MyDrive/JSL/sparknlp/sparknlp.jar .\n",
    "!cp drive/MyDrive/JSL/sparknlp/spark_nlp-5.5.1-py2.py3-none-any.whl ."
   ],
   "metadata": {
    "id": "mjV3NcQ8eA52"
   },
   "execution_count": 8,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "!pip install pyspark"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "pEmutNjReCgc",
    "outputId": "32610063-174f-432b-be4a-6ab2ae9dd709"
   },
   "execution_count": 3,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.3)\n",
      "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "!pip install spark_nlp-5.5.1-py2.py3-none-any.whl"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3qjPeDjvfCpA",
    "outputId": "620c793f-5cb1-4a82-f687-53f3be348d9c"
   },
   "execution_count": 9,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Processing ./spark_nlp-5.5.1-py2.py3-none-any.whl\n",
      "Installing collected packages: spark-nlp\n",
      "Successfully installed spark-nlp-5.5.1\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# import sparknlp\n",
    "# # let's start Spark with Spark NLP\n",
    "# spark = sparknlp.start()\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "spark = SparkSession.builder \\\n",
    "    .appName(\"SparkNLP\") \\\n",
    "    .master(\"local[*]\") \\\n",
    "    .config(\"spark.driver.memory\", \"12G\") \\\n",
    "    .config(\"spark.serializer\", \"org.apache.spark.serializer.KryoSerializer\") \\\n",
    "    .config(\"spark.kryoserializer.buffer.max\", \"2000M\") \\\n",
    "    .config(\"spark.driver.maxResultSize\", \"0\") \\\n",
    "    .config(\"spark.jars\", \"./sparknlp.jar\") \\\n",
    "    .getOrCreate()\n",
    "\n",
    "\n",
    "print(\"Apache Spark version: {}\".format(spark.version))"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "DczWop6QeE8F",
    "outputId": "714b032f-e076-4aa3-8cf2-10eea6993c4d"
   },
   "execution_count": 5,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Apache Spark version: 3.5.3\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "RFOFhaEedalB"
   },
   "source": [
    "## Setup and Initialization\n",
    "Let's keep in mind a few things before we start 😊\n",
    "\n",
    "Support for reading email files was introduced in Spark NLP 5.5.2. Please make sure you have upgraded to the latest Spark NLP release.\n",
    "\n",
    "For local files example we will download a couple of email files from Spark NLP Github repo:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ya8qZe00dalC",
    "outputId": "a9916407-f76d-4c59-fdad-ea17ca0a4326"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "mkdir: cannot create directory ‘email-files’: File exists\n",
      "--2024-11-13 21:01:15--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/email-text-attachments.eml\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 3175 (3.1K) [text/plain]\n",
      "Saving to: ‘email-files/email-text-attachments.eml’\n",
      "\n",
      "email-text-attachme 100%[===================>]   3.10K  --.-KB/s    in 0s      \n",
      "\n",
      "2024-11-13 21:01:15 (29.9 MB/s) - ‘email-files/email-text-attachments.eml’ saved [3175/3175]\n",
      "\n",
      "--2024-11-13 21:01:15--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/test-several-attachments.eml\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 1324361 (1.3M) [text/plain]\n",
      "Saving to: ‘email-files/test-several-attachments.eml’\n",
      "\n",
      "test-several-attach 100%[===================>]   1.26M  --.-KB/s    in 0.05s   \n",
      "\n",
      "2024-11-13 21:01:16 (26.7 MB/s) - ‘email-files/test-several-attachments.eml’ saved [1324361/1324361]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!mkdir email-files\n",
    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/email-text-attachments.eml -P email-files\n",
    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1093-Adding-support-to-read-Email-files/src/test/resources/reader/email/test-several-attachments.eml -P email-files"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "!ls -lh ./email-files"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3xgGItNbU2DZ",
    "outputId": "12f8a7be-f9b4-49ce-a9ab-222142f28293"
   },
   "execution_count": 18,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "total 1.3M\n",
      "-rw-r--r-- 1 root root 3.2K Nov 13 21:01 email-text-attachments.eml\n",
      "-rw-r--r-- 1 root root 1.3M Nov 13 21:01 test-several-attachments.eml\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "EoFI66NAdalE"
   },
   "source": [
    "## Parsing Email from Local Files\n",
    "Use the `email()` method to parse email content from local directories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bAkMjJ1vdalE",
    "outputId": "4b360b6c-5049-4f10-bb52-60e0e0e52e52"
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Warning::Spark Session already created, some configs may not take.\n",
      "+--------------------+\n",
      "|               email|\n",
      "+--------------------+\n",
      "|[{Title, Email Te...|\n",
      "|[{Title, Test Sev...|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import sparknlp\n",
    "email_df = sparknlp.read().email(\"./email-files\")\n",
    "\n",
    "email_df.select(\"email\").show()"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "email_df.printSchema()"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7CMPPubFTeHj",
    "outputId": "48ee68cf-0f7f-408a-a855-2fd2eb2e8bd1"
   },
   "execution_count": 21,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "root\n",
      " |-- path: string (nullable = true)\n",
      " |-- content: binary (nullable = true)\n",
      " |-- email: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- elementType: string (nullable = true)\n",
      " |    |    |-- content: string (nullable = true)\n",
      " |    |    |-- metadata: map (nullable = true)\n",
      " |    |    |    |-- key: string\n",
      " |    |    |    |-- value: string (valueContainsNull = true)\n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "You can also use DFS like Databricks `dbfs://` or HDFS directories `hdfs://`"
   ],
   "metadata": {
    "id": "Qooecm9VTeus"
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "colab": {
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
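The notebook's closing cell notes that the same call works against distributed file systems. A sketch of that usage, plus flattening the `email` column into one row per parsed element (the dbfs:/hdfs paths are placeholders, not paths from this commit):

# Sketch only: the paths below are placeholders; any URI Spark can resolve should work.
import sparknlp
from pyspark.sql.functions import col, explode

email_df = sparknlp.read().email("hdfs://namenode:9000/user/data/email-files")
# email_df = sparknlp.read().email("dbfs:/FileStore/email-files")  # Databricks

# One row per parsed element, matching the schema printed above
elements_df = email_df.select(explode("email").alias("el")) \
    .select(col("el.elementType"), col("el.content"), col("el.metadata"))
elements_df.show(truncate=40)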
