From 7adf658f1e407d847a6e11bf4402c05470c1e835 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Wed, 2 Oct 2024 17:30:14 -0500 Subject: [PATCH 01/24] [SPARKNLP-1068] Introducing BLIPForQuestionAnswering transformer --- python/sparknlp/annotator/cv/__init__.py | 1 + .../cv/blip_for_question_answering.py | 107 +++++++ python/sparknlp/base/image_assembler.py | 11 + python/sparknlp/base/light_pipeline.py | 29 +- python/sparknlp/internal/__init__.py | 8 + .../cv/blip_for_question_answering_test.py | 80 +++++ .../johnsnowlabs/ml/ai/BLIPClassifier.scala | 215 +++++++++++++ .../johnsnowlabs/nlp/AnnotationImage.scala | 24 +- .../nlp/HasBatchedAnnotateImage.scala | 3 +- .../com/johnsnowlabs/nlp/ImageAssembler.scala | 40 ++- .../com/johnsnowlabs/nlp/LightPipeline.scala | 83 +++-- .../cv/BLIPForQuestionAnswering.scala | 301 ++++++++++++++++++ .../tokenizer/bpe/BertTokenizer.scala | 81 +++++ .../tokenizer/bpe/BpeSpecialTokens.scala | 8 + .../nlp/pretrained/PretrainedPipeline.scala | 11 +- .../johnsnowlabs/nlp/AssertAnnotations.scala | 9 +- .../johnsnowlabs/nlp/ImageAssemblerTest.scala | 29 +- .../cv/BLIPForQuestionAnsweringTest.scala | 186 +++++++++++ ...LIPForZeroShotClassificationTestSpec.scala | 2 +- .../cv/ViTImageClassificationTestSpec.scala | 6 +- ...derDecoderForImageCaptioningTestSpec.scala | 2 +- 21 files changed, 1175 insertions(+), 61 deletions(-) create mode 100644 python/sparknlp/annotator/cv/blip_for_question_answering.py create mode 100644 python/test/annotator/cv/blip_for_question_answering_test.py create mode 100644 src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala diff --git a/python/sparknlp/annotator/cv/__init__.py b/python/sparknlp/annotator/cv/__init__.py index 7c89437989600b..37eeaf696bb2a8 100644 --- a/python/sparknlp/annotator/cv/__init__.py +++ b/python/sparknlp/annotator/cv/__init__.py @@ -16,3 +16,4 @@ from sparknlp.annotator.cv.convnext_for_image_classification import * from sparknlp.annotator.cv.vision_encoder_decoder_for_image_captioning import * from sparknlp.annotator.cv.clip_for_zero_shot_classification import * +from sparknlp.annotator.cv.blip_for_question_answering import * \ No newline at end of file diff --git a/python/sparknlp/annotator/cv/blip_for_question_answering.py b/python/sparknlp/annotator/cv/blip_for_question_answering.py new file mode 100644 index 00000000000000..b861449e27d862 --- /dev/null +++ b/python/sparknlp/annotator/cv/blip_for_question_answering.py @@ -0,0 +1,107 @@ +# Copyright 2017-2024 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from sparknlp.common import *
+
+class BLIPForQuestionAnswering(AnnotatorModel,
+                               HasBatchedAnnotateImage,
+                               HasImageFeatureProperties,
+                               HasEngine,
+                               HasCandidateLabelsProperties,
+                               HasRescaleFactor):
+
+    name = "BLIPForQuestionAnswering"
+
+    inputAnnotatorTypes = [AnnotatorType.IMAGE]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with "
+                             "config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    maxSentenceLength = Param(Params._dummy(),
+                              "maxSentenceLength",
+                              "Maximum sentence length that the annotator will process. Above this, the sentence is skipped",
+                              typeConverter=TypeConverters.toInt)
+
+    def setMaxSentenceSize(self, value):
+        """Sets Maximum sentence length that the annotator will process, by
+        default 50.
+
+        Parameters
+        ----------
+        value : int
+            Maximum sentence length that the annotator will process
+        """
+        return self._set(maxSentenceLength=value)
+
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering",
+                 java_model=None):
+        super(BLIPForQuestionAnswering, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=2,
+            size=224,
+            maxSentenceLength=50
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        BLIPForQuestionAnswering
+            The restored model
+        """
+        from sparknlp.internal import _BLIPForQuestionAnswering
+        jModel = _BLIPForQuestionAnswering(folder, spark_session._jsparkSession)._java_obj
+        return BLIPForQuestionAnswering(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="blip_vqa_tf", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "blip_vqa_tf"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        BLIPForQuestionAnswering
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(BLIPForQuestionAnswering, name, lang, remote_loc)
\ No newline at end of file
diff --git a/python/sparknlp/base/image_assembler.py b/python/sparknlp/base/image_assembler.py
index 3214ff37324172..cc8a9eb8c91253 100644
--- a/python/sparknlp/base/image_assembler.py
+++ b/python/sparknlp/base/image_assembler.py
@@ -65,6 +65,7 @@ class ImageAssembler(AnnotatorTransformer):
     outputAnnotatorType = AnnotatorType.IMAGE
 
     inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString)
+    textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString)
     outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString)
 
     name = 'ImageAssembler'
@@ -101,3 +102,13 @@ def setOutputCol(self, value):
     def getOutputCol(self):
        """Gets output column name of annotations."""
        return self.getOrDefault(self.outputCol)
+
+    def setTextCol(self, value):
+        """Sets an optional text column name.
+
+        Parameters
+        ----------
+        value : str
+            Name of an optional input text column
+        """
+        return self._set(textCol=value)
diff --git a/python/sparknlp/base/light_pipeline.py b/python/sparknlp/base/light_pipeline.py
index 0622652fc01a42..4dd4f9128622ad 100644
--- a/python/sparknlp/base/light_pipeline.py
+++ b/python/sparknlp/base/light_pipeline.py
@@ -277,7 +277,7 @@ def __fullAnnotateQuestionAnswering(self, question, context):
         return result
 
-    def fullAnnotateImage(self, path_to_image):
+    def fullAnnotateImage(self, path_to_image, text=None):
        """Annotates the data provided into `Annotation` type results.
 
        The data should be either a list or a str.
 
@@ -287,27 +287,38 @@
        path_to_image : list or str
            Source path of image, list of paths to images
 
+        text: list or str, optional
+            Optional list or str of texts. If None, defaults to empty list if path_to_image is a list, or empty string if path_to_image is a string.
+
        Returns
        -------
        List[AnnotationImage]
            The result of the annotation
        """
+        if not isinstance(path_to_image, (str, list)):
+            raise TypeError("argument for path_to_image must be 'str' or 'list[str]'")
+
+        if text is None:
+            text = "" if isinstance(path_to_image, str) else []
+
+        if type(path_to_image) != type(text):
+            raise ValueError("`path_to_image` and `text` must be of the same type")
+
        stages = self.pipeline_model.stages
        if not self._skipPipelineValidation(stages):
            self._validateStagesInputCols(stages)
 
-        if type(path_to_image) is str:
+        if isinstance(path_to_image, str):
            path_to_image = [path_to_image]
+            text = [text]
 
-        if type(path_to_image) is list:
-            result = []
+        result = []
 
-            for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image):
-                result.append(self.__buildStages(image_result))
+        for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image, text):
+            result.append(self.__buildStages(image_result))
+
+        return result
 
-            return result
-        else:
-            raise TypeError("argument for annotation may be 'str' or list[str]")
 
    def __buildStages(self, annotations_result):
        stages = {}
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index c8732ef3ecb4e5..0386e5201968e4 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -999,3 +999,11 @@ def __init__(self, path, jspark):
        super(_SnowFlakeEmbeddingsLoader, self).__init__(
            "com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings.loadSavedModel", path, jspark
        )
+
+class _BLIPForQuestionAnswering(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_BLIPForQuestionAnswering, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering.loadSavedModel",
+            path,
+            jspark,
+        )
\ No newline at end of file
diff --git a/python/test/annotator/cv/blip_for_question_answering_test.py b/python/test/annotator/cv/blip_for_question_answering_test.py
new file mode 100644
index 00000000000000..8eb0dbae3e70ae
--- /dev/null
+++ b/python/test/annotator/cv/blip_for_question_answering_test.py
@@ -0,0 +1,80 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import pytest +import os + +from sparknlp.annotator import * +from sparknlp.base import * +from pyspark.sql.functions import lit +from test.util import SparkSessionForTest + + +class BLIPForQuestionAnsweringTestSetup(unittest.TestCase): + + def setUp(self): + self.images_path = os.getcwd() + "/../src/test/resources/image/" + image_df = SparkSessionForTest.spark.read.format("image").load( + path=self.images_path + ) + + self.test_df = image_df.withColumn("text", lit("What's this picture about?")) + + image_assembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler") + + imageClassifier = BLIPForQuestionAnswering.pretrained() \ + .setInputCols("image_assembler") \ + .setOutputCol("answer") \ + .setSize(384) + + self.pipeline = Pipeline( + stages=[ + image_assembler, + imageClassifier, + ] + ) + + self.model = self.pipeline.fit(self.test_df) + +@pytest.mark.slow +class BLIPForQuestionAnsweringTest(BLIPForQuestionAnsweringTestSetup, unittest.TestCase): + + def setUp(self): + super().setUp() + + def runTest(self): + result = self.model.transform(self.test_df).collect() + + for row in result: + self.assertTrue(row["answer"] != "") + + +@pytest.mark.slow +class LightBLIPForQuestionAnsweringTest(BLIPForQuestionAnsweringTestSetup, unittest.TestCase): + + def setUp(self): + super().setUp() + + def runTest(self): + light_pipeline = LightPipeline(self.model) + image_path = self.images_path + "bluetick.jpg" + print("image_path: " + image_path) + annotations_result = light_pipeline.fullAnnotateImage( + image_path, + "What's this picture about?" + ) + + for result in annotations_result: + self.assertTrue(len(result["image_assembler"]) > 0) + self.assertTrue(len(result["answer"]) > 0) \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala b/src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala new file mode 100644 index 00000000000000..3182d6dd0fdf92 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala @@ -0,0 +1,215 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.ml.ai + +import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} +import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor +import com.johnsnowlabs.nlp.annotators.cv.util.io.ImageIOUtils +import com.johnsnowlabs.nlp.annotators.cv.util.transform.ImageResizeUtils +import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BertTokenizer +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.WordpieceEncoder +import com.johnsnowlabs.nlp.{Annotation, AnnotationImage} +import org.tensorflow.ndarray.buffer.{IntDataBuffer, LongDataBuffer} + +import scala.collection.JavaConverters._ + +private[johnsnowlabs] class BLIPClassifier( + val tensorflowWrapper: TensorflowWrapper, + configProtoBytes: Option[Array[Byte]] = None, + tokenizer: BertTokenizer, + preprocessor: Preprocessor, + signatures: Option[Map[String, String]] = None, + vocabulary: Map[String, Int]) + extends Serializable { + + private val _tfBLIPSignatures: Map[String, String] = + signatures.getOrElse(ModelSignatureManager.apply()) + + def predict( + images: Array[AnnotationImage], + questions: Seq[Annotation], + maxSentenceLength: Int, + batchSize: Int): Seq[Annotation] = { + + val sentences = SentenceSplit.unpack(questions).toArray + val tokenizedSentences = TokenizedWithSentence.unpack(questions).toArray + val inputIds = encodeTokenizedSentence( + tokenizedSentences, + sentences, + batchSize, + maxSentenceLength, + caseSensitive = false) + + val pixelValues = images + .grouped(batchSize) + .flatMap { batch => + encodeImage(batch, preprocessor) + } + .toArray + + val outputs = generate(pixelValues, inputIds, maxSentenceLength) + val decodedOutput = tokenizer.decodeTokens(outputs) + Seq(Annotation(decodedOutput)) + } + + def generate( + imagesBatch: Array[Array[Array[Array[Float]]]], + inputsBatch: Array[Array[Int]], + maxSentenceLength: Int): Array[Int] = { + val tensors = new TensorResources() + val imageTensors = tensors.createTensor(imagesBatch) + + val batchLength = inputsBatch.length + // [nb of encoded sentences , maxSentenceLength] + val shape = Array(imagesBatch.length.toLong, maxSentenceLength) + + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val maskBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength) + + inputsBatch.zipWithIndex + .foreach { case (sentence, idx) => + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentence) + maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0L else 1L)) + } + + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers) + + val runner = tensorflowWrapper + .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) + .runner + + runner + .feed( + _tfBLIPSignatures + .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_ids"), + tokenTensors) + .feed( + _tfBLIPSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .feed( + _tfBLIPSignatures + .getOrElse(ModelSignatureConstants.PixelValuesInput.key, "missing_pixel_values"), + imageTensors) + .fetch(_tfBLIPSignatures + .getOrElse(ModelSignatureConstants.DecoderOutput.key, "missing_output")) + + val outs = runner.run().asScala + val output = 
TensorResources.extractInts(outs.head) + + tensors.clearSession(outs) + tensors.clearTensors() + imageTensors.close() + + output + } + + /** Calculate softmax from returned logits + * @param scores + * logits output from output layer + * @return + */ + def calculateSoftmax(scores: Array[Float]): Array[Float] = { + val exp = scores.map(x => math.exp(x)) + exp.map(x => x / exp.sum).map(_.toFloat) + } + + private def encodeImage( + annotations: Array[AnnotationImage], + preprocessor: Preprocessor): Array[Array[Array[Array[Float]]]] = { + + val batchProcessedImages = annotations.map { annot => + val bufferedImage = ImageIOUtils.byteToBufferedImage( + bytes = annot.result, + w = annot.width, + h = annot.height, + nChannels = annot.nChannels) + + val resizedImage = if (preprocessor.do_resize) { + ImageResizeUtils.resizeBufferedImage( + width = preprocessor.size, + height = preprocessor.size, + preprocessor.resample)(bufferedImage) + } else bufferedImage + + val normalizedImage = + ImageResizeUtils.normalizeAndConvertBufferedImage( + img = resizedImage, + mean = preprocessor.image_mean, + std = preprocessor.image_std, + doNormalize = preprocessor.do_normalize, + doRescale = preprocessor.do_rescale, + rescaleFactor = preprocessor.rescale_factor) + + normalizedImage + } + + batchProcessedImages + + } + + def encodeTokenizedSentence( + tokenizedSentences: Seq[TokenizedSentence], + sentences: Seq[Sentence], + batchSize: Int, + maxSentenceLength: Int, + caseSensitive: Boolean): Array[Array[Int]] = { + val wordPieceTokenizedSentences = + tokenizeWithAlignment(tokenizedSentences, maxSentenceLength, caseSensitive) + + /*Run calculation by batches*/ + wordPieceTokenizedSentences + .zip(sentences) + .zipWithIndex + .grouped(batchSize) + .flatMap { batch => + val tokensBatch = batch.map(x => (x._1._1, x._2)) + tokenizer.encode(tokensBatch, maxSentenceLength) + } + .toArray + } + + def tokenizeWithAlignment( + sentences: Seq[TokenizedSentence], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + val encoder = new WordpieceEncoder(vocabulary) + + sentences.map { tokenIndex => + // filter empty and only whitespace tokens + val bertTokens = + tokenIndex.indexedTokens.filter(x => x.token.nonEmpty && !x.token.equals(" ")).map { + token => + val content = if (caseSensitive) token.token else token.token.toLowerCase() + val sentenceBegin = token.begin + val sentenceEnd = token.end + val sentenceIndex = tokenIndex.sentenceIndex + val result = + tokenizer.tokenize(Sentence(content, sentenceBegin, sentenceEnd, sentenceIndex)) + if (result.nonEmpty) result.head else IndexedToken("") + } + val wordpieceTokens = bertTokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) + WordpieceTokenizedSentence(wordpieceTokens) + } + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala index 72ef1c6d73a123..b566c3c5ccb7ea 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala @@ -48,7 +48,8 @@ case class AnnotationImage( nChannels: Int, mode: Int, result: Array[Byte], - metadata: Map[String, String]) + metadata: Map[String, String], + text: String = "") extends IAnnotation { override def equals(obj: Any): Boolean = { @@ -61,7 +62,8 @@ case class AnnotationImage( this.nChannels == annotation.nChannels && this.mode == annotation.mode && this.result.sameElements(annotation.result) && - this.metadata == annotation.metadata + 
this.metadata == annotation.metadata && + this.text == annotation.text case _ => false } } @@ -94,6 +96,10 @@ case class AnnotationImage( metadata } + def getText: String = { + text + } + } object AnnotationImage { @@ -112,7 +118,8 @@ object AnnotationImage { StructField("mode", IntegerType, nullable = false), // Bytes in OpenCV-compatible order: row-wise BGR in most cases StructField("result", BinaryType, nullable = false), - StructField("metadata", MapType(StringType, StringType), nullable = true))) + StructField("metadata", MapType(StringType, StringType), nullable = true), + StructField("text", StringType, nullable = true))) val arrayType = new ArrayType(dataType, true) @@ -122,7 +129,8 @@ object AnnotationImage { width: Int, nChannels: Int, mode: Int, - result: Array[Byte]) + result: Array[Byte], + text: String) /** This method converts a [[org.apache.spark.sql.Row]] into an [[AnnotationImage]] * @@ -132,6 +140,7 @@ object AnnotationImage { * AnnotationImage */ def apply(row: Row): AnnotationImage = { + println(s"row.getString(8): ${row.getString(8)}") AnnotationImage( row.getString(0), row.getString(1), @@ -140,7 +149,8 @@ object AnnotationImage { row.getInt(4), row.getInt(5), row.getAs[Array[Byte]](6), - row.getMap[String, String](7)) + row.getMap[String, String](7), + row.getString(8)) } def apply(image: ImageFields): AnnotationImage = @@ -152,6 +162,6 @@ object AnnotationImage { nChannels = image.nChannels, mode = image.mode, result = Array.emptyByteArray, - Map.empty[String, String]) - + metadata = Map.empty[String, String], + text = image.text) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala index ded31e5e59cb51..d105c879143fbb 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala @@ -65,7 +65,8 @@ trait HasBatchedAnnotateImage[M <: Model[M]] { r.getInt(4), r.getInt(5), r.getAs(6), - r.getMap[String, String](7))) + r.getMap[String, String](7), + r.getString(8))) }) }) val outputAnnotations = batchAnnotate(inputAnnotations) diff --git a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala index 3ef7ccd67d9803..73b08bae40d695 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala @@ -110,7 +110,26 @@ class ImageAssembler(override val uid: String) */ def getInputCol: String = $(inputCol) - setDefault(inputCol -> IMAGE, outputCol -> "image_assembler") + /** Input text column for processing + * + * @group param + */ + val textCol: Param[String] = + new Param[String](this, "textCol", "input text column for processing") + + /** Input text column for processing + * + * @group setParam + */ + def setTextCol(value: String): this.type = set(textCol, value) + + /** Input text column for processing + * + * @group getParam + */ + def getTextCol: String = $(textCol) + + setDefault(inputCol -> IMAGE, outputCol -> "image_assembler", textCol -> "text") def this() = this(Identifiable.randomUID("ImageAssembler")) @@ -118,7 +137,8 @@ class ImageAssembler(override val uid: String) private[nlp] def assemble( image: Option[ImageFields], - metadata: Map[String, String]): Seq[AnnotationImage] = { + metadata: Map[String, String], + text: Option[String] = None): Seq[AnnotationImage] = { if (image.isDefined) { Seq( @@ -130,14 +150,21 @@ class ImageAssembler(override val uid: 
String) nChannels = image.get.nChannels, mode = image.get.mode, result = image.get.data, - metadata = metadata)) + metadata = metadata, + text = text.getOrElse(""))) } else Seq.empty } private[nlp] def dfAssemble: UserDefinedFunction = udf { (image: ImageFields) => // Apache Spark has only 1 image per row - assemble(Some(image), Map("image" -> "0")) + assemble(Some(image), Map("image" -> "0"), None) + } + + private[nlp] def dfAssembleWithText: UserDefinedFunction = udf { + (image: ImageFields, text: String) => + // Apache Spark has only 1 image per row + assemble(Some(image), Map("image" -> "0"), Some(text)) } /** requirement for pipeline transformation validation. It is called on fit() */ @@ -163,7 +190,10 @@ class ImageAssembler(override val uid: String) ImageSchemaUtils.isImage(dataset.schema(getInputCol)), s"column $getInputCol doesn't have Apache Spark ImageSchema. Make sure you read your images via spark.read.format(image).load(PATH)") - val imageAnnotations = { + val textColExists = dataset.schema.fields.exists(_.name == getTextCol) + val imageAnnotations = if (textColExists) { + dfAssembleWithText(dataset.col($(inputCol)), dataset.col($(textCol))) + } else { dfAssemble(dataset($(inputCol))) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala b/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala index 2271bd945c64b5..d6793fdba19e8e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala @@ -44,7 +44,7 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = def fullAnnotate(target: String, optionalTarget: String = ""): Map[String, Seq[IAnnotation]] = { if (target.contains("/") && ResourceHelper.validFile(target)) { - fullAnnotateImage(target) + fullAnnotateImage(target, optionalTarget) } else { fullAnnotateInternal(target, optionalTarget) } @@ -60,7 +60,7 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = } if (targets.head.contains("/") && ResourceHelper.validFile(targets.head)) { - targets.par.map(target => fullAnnotateImage(target)).toArray + fullAnnotateImages(targets, optionalTargets) } else { (targets zip optionalTargets).par.map { case (target, optionalTarget) => fullAnnotate(target, optionalTarget) @@ -68,14 +68,20 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = } } - def fullAnnotateImage(pathToImages: Array[String]): Array[Map[String, Seq[IAnnotation]]] = { - pathToImages.par - .map(imageFilePath => fullAnnotateInternal(imageFilePath)) - .toArray + def fullAnnotateImages( + pathToImages: Array[String], + texts: Array[String] = Array.empty): Array[Map[String, Seq[IAnnotation]]] = { + val safeTexts = if (texts.isEmpty) Array.fill(pathToImages.length)("") else texts + (pathToImages zip safeTexts).par.map { case (imageFilePath, text) => + fullAnnotateImage(imageFilePath, text) + }.toArray } - def fullAnnotateImage(pathToImage: String): Map[String, Seq[IAnnotation]] = { - fullAnnotateInternal(pathToImage) + def fullAnnotateImage(pathToImage: String, text: String = ""): Map[String, Seq[IAnnotation]] = { + val isValidFile = ResourceHelper.validFile(pathToImage) + if (!isValidFile || isValidFile && text.isEmpty) { + Map() + } else fullAnnotateInternal(pathToImage, text) } def fullAnnotate(audio: Array[Double]): Map[String, Seq[IAnnotation]] = { @@ -108,7 +114,7 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = optionalTarget, annotations) case 
imageAssembler: ImageAssembler => - processImageAssembler(target, imageAssembler, annotations) + processImageAssembler(target, optionalTarget, imageAssembler, annotations) case audioAssembler: AudioAssembler => processAudioAssembler(audio, audioAssembler, annotations) case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations @@ -157,12 +163,13 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = private def processImageAssembler( target: String, + text: String, imageAssembler: ImageAssembler, annotations: Map[String, Seq[IAnnotation]]): Map[String, Seq[IAnnotation]] = { val currentImageFields = ImageIOUtils.imagePathToImageFields(target) annotations.updated( imageAssembler.getOutputCol, - imageAssembler.assemble(currentImageFields, Map.empty[String, String])) + imageAssembler.assemble(currentImageFields, Map.empty[String, String], Some(text))) } private def processAudioAssembler( @@ -209,9 +216,9 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = getCombinedAnnotations(batchedAnnotatorImage.getInputCols, annotations) val batchedAnnotations = Seq(combinedAnnotations.map(_.asInstanceOf[AnnotationImage])) - annotations.updated( - batchedAnnotatorImage.getOutputCol, - batchedAnnotatorImage.batchAnnotate(batchedAnnotations).head) + val outputCol = batchedAnnotatorImage.getOutputCol + val annotateResult = batchedAnnotatorImage.batchAnnotate(batchedAnnotations) + annotations.updated(outputCol, annotateResult.head) } private def processBatchedAnnotatorAudio( @@ -361,15 +368,35 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = fullAnnotateImage(pathToImage).mapValues(_.asJava).asJava } - def fullAnnotateImageJava(pathToImages: java.util.ArrayList[String]) + import scala.collection.JavaConverters._ + + def fullAnnotateImageJava( + pathToImages: java.util.ArrayList[String], + texts: java.util.ArrayList[String]) : java.util.List[java.util.Map[String, java.util.List[IAnnotation]]] = { - pathToImages.asScala.par - .map { imageFilePath => - fullAnnotateInternal(imageFilePath).mapValues(_.asJava).asJava + if (texts.isEmpty) { + pathToImages.asScala.par + .map { imageFilePath => + fullAnnotateInternal(imageFilePath).mapValues(_.asJava).asJava + } + .toList + .asJava + } else { + + if (pathToImages.size != texts.size) { + throw new IllegalArgumentException( + "pathToImages and texts must have the same number of elements.") } - .toList - .asJava + val imageTextPairs = pathToImages.asScala.zip(texts.asScala).par + + imageTextPairs + .map { case (imageFilePath, text) => + fullAnnotateImage(imageFilePath, text).mapValues(_.asJava).asJava + } + .toList + .asJava + } } def fullAnnotateSingleAudioJava( @@ -394,14 +421,16 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = } def annotate(target: String, optionalTarget: String = ""): Map[String, Seq[String]] = { - fullAnnotate(target, optionalTarget).mapValues(_.map { iAnnotation => - val annotation = iAnnotation.asInstanceOf[Annotation] - annotation.annotatorType match { - case AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS - if parseEmbeddings => - annotation.embeddings.mkString(" ") - case _ => annotation.result - } + val annotations = fullAnnotate(target, optionalTarget) + annotations.mapValues(_.map { + case annotation: Annotation => + annotation.annotatorType match { + case AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS + if parseEmbeddings => + 
annotation.embeddings.mkString(" ") + case _ => annotation.result + } + case _ => "" }) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala new file mode 100644 index 00000000000000..9cd5bca6ff9e35 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala @@ -0,0 +1,301 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.cv + +import com.johnsnowlabs.ml.ai.BLIPClassifier +import com.johnsnowlabs.ml.tensorflow.{ + ReadTensorflowModel, + TensorflowWrapper, + WriteTensorflowModel +} +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadJsonStringAsset, + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, IMAGE} +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.{RegexTokenizer, Tokenizer, TokenizerModel} +import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector +import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BertTokenizer, SpecialTokens} +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer +import com.johnsnowlabs.nlp.serialization.MapFeature +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param.{IntArrayParam, IntParam} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +class BLIPForQuestionAnswering(override val uid: String) + extends AnnotatorModel[BLIPForQuestionAnswering] + with HasBatchedAnnotateImage[BLIPForQuestionAnswering] + with HasImageFeatureProperties + with WriteTensorflowModel + with HasEngine { + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("BLIPForQuestionAnswering")) + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(IMAGE) + override val outputAnnotatorType: AnnotatorType = DOCUMENT + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * config_proto.SerializeToString() + * + * @group param + */ + val configProtoBytes = new IntArrayParam( + this, + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()") + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * config_proto.SerializeToString() + * + * @group setParam + */ + def setConfigProtoBytes(bytes: Array[Int]): BLIPForQuestionAnswering.this.type = + set(this.configProtoBytes, bytes) + + /** ConfigProto from tensorflow, serialized into byte array. 
Get with + * config_proto.SerializeToString() + * + * @group getParam + */ + def getConfigProtoBytes: Option[Array[Byte]] = + get(this.configProtoBytes).map(_.map(_.toByte)) + + /** It contains TF model signatures for the laded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + set(signatures, value) + this + } + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) + + /** @group getParam */ + protected[nlp] def getVocabulary: Map[String, Int] = $$(vocabulary) + + /** Max sentence length to process (Default: `512`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + set(maxSentenceLength, value) + this + } + + /** @group getParam */ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + private var _model: Option[Broadcast[BLIPClassifier]] = None + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + preprocessor: Preprocessor, + tensorflow: TensorflowWrapper): this.type = { + if (_model.isEmpty) { + + val specialTokens = SpecialTokens.getSpecialTokensForModel("bert", getVocabulary) + val bertTokenizer = new BertTokenizer(getVocabulary, specialTokens) + + _model = Some( + spark.sparkContext.broadcast( + new BLIPClassifier( + tensorflow, + configProtoBytes = getConfigProtoBytes, + tokenizer = bertTokenizer, + preprocessor = preprocessor, + signatures = getSignatures, + vocabulary = $$(vocabulary)))) + } + this + } + + /** @group getParam */ + def getModelIfNotSet: BLIPClassifier = _model.get.value + + setDefault(batchSize -> 8, size -> 384, maxSentenceLength -> 50) + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations in batches that correspond to inputAnnotationCols generated by previous + * annotators if any + * @return + * any number of annotations processed for every batch of input annotations. 
Not necessary
+   * one to one relationship
+   */
+  override def batchAnnotate(
+      batchedAnnotations: Seq[Array[AnnotationImage]]): Seq[Seq[Annotation]] = {
+
+    batchedAnnotations
+      .filter { annotationImages =>
+        annotationImages.exists(_.text.nonEmpty)
+      }
+      .map { cleanAnnotationImages =>
+        val validImages = cleanAnnotationImages.filter(_.result.nonEmpty)
+        val questionAnnotations = extractInputAnnotation(validImages)
+
+        getModelIfNotSet.predict(
+          validImages,
+          questionAnnotations,
+          $(maxSentenceLength),
+          $(batchSize))
+      }
+  }
+
+  private def extractInputAnnotation(
+      annotationImages: Array[AnnotationImage]): Seq[Annotation] = {
+    val questions = annotationImages.map(annotationImage => Annotation(annotationImage.text))
+    val sentenceAnnotations =
+      new SentenceDetector().setInputCols("document").setOutputCol("sentence")
+    val sentencesQuestions = sentenceAnnotations.annotate(questions)
+
+    val tokenizerAnnotation = new RegexTokenizer().setInputCols("sentence").setOutputCol("token")
+    val tokenQuestions = tokenizerAnnotation.annotate(sentencesQuestions)
+
+    sentencesQuestions ++ tokenQuestions
+  }
+
+  override def onWrite(path: String, spark: SparkSession): Unit = {
+    super.onWrite(path, spark)
+    writeTensorflowModelV2(
+      path,
+      spark,
+      getModelIfNotSet.tensorflowWrapper,
+      "_image_qa",
+      BLIPForQuestionAnswering.tfFile,
+      configProtoBytes = getConfigProtoBytes)
+  }
+
+}
+
+trait ReadablePretrainedBLIPForQuestionAnswering
+    extends ParamsAndFeaturesReadable[BLIPForQuestionAnswering]
+    with HasPretrained[BLIPForQuestionAnswering] {
+
+  override val defaultModelName: Some[String] = Some("blip_vqa_tf")
+
+  /** Java compliant-overrides */
+  override def pretrained(): BLIPForQuestionAnswering = super.pretrained()
+
+  override def pretrained(name: String): BLIPForQuestionAnswering =
+    super.pretrained(name)
+
+  override def pretrained(name: String, lang: String): BLIPForQuestionAnswering =
+    super.pretrained(name, lang)
+
+  override def pretrained(
+      name: String,
+      lang: String,
+      remoteLoc: String): BLIPForQuestionAnswering =
+    super.pretrained(name, lang, remoteLoc)
+
+}
+
+trait ReadBLIPForQuestionAnsweringDLModel extends ReadTensorflowModel {
+  this: ParamsAndFeaturesReadable[BLIPForQuestionAnswering] =>
+  override val tfFile: String = "blip_vqa_tensorflow"
+
+  def readModel(instance: BLIPForQuestionAnswering, path: String, spark: SparkSession): Unit = {
+    val tf = readTensorflowModel(path, spark, "_blip_vqa_tf", initAllTables = false)
+
+    val preprocessor = Preprocessor(
+      do_normalize = true,
+      do_resize = true,
+      "BLIPFeatureExtractor",
+      instance.getImageMean,
+      instance.getImageStd,
+      instance.getResample,
+      instance.getSize)
+
+    instance.setModelIfNotSet(spark, preprocessor, tf)
+  }
+
+  addReader(readModel)
+
+  def loadSavedModel(modelPath: String, spark: SparkSession): BLIPForQuestionAnswering = {
+    val (localModelPath, detectedEngine) = modelSanityCheck(modelPath)
+    val preprocessorConfigJsonContent =
+      loadJsonStringAsset(localModelPath, "preprocessor_config.json")
+    val preprocessorConfig = Preprocessor.loadPreprocessorConfig(preprocessorConfigJsonContent)
+    val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap
+
+    val annotatorModel = new BLIPForQuestionAnswering()
+    annotatorModel.set(annotatorModel.engine, detectedEngine)
+
+    detectedEngine match {
+      case TensorFlow.name =>
+        val (wrapper, signatures) =
+          TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true)
+
+        val _signatures = signatures match {
+          case Some(s) => s
+          case None
=> throw new Exception("Cannot load signature definitions from model!") + } + + /** the order of setSignatures is important if we use getSignatures inside + * setModelIfNotSet + */ + annotatorModel + .setVocabulary(vocabs) + .setSignatures(_signatures) + .setModelIfNotSet(spark, preprocessorConfig, wrapper) + .setSize(384) + + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +object BLIPForQuestionAnswering + extends ReadablePretrainedBLIPForQuestionAnswering + with ReadBLIPForQuestionAnsweringDLModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala new file mode 100644 index 00000000000000..d3650367bbe1cf --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala @@ -0,0 +1,81 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.tokenizer.bpe + +import com.johnsnowlabs.nlp.annotators.common.WordpieceTokenizedSentence +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer + +import java.nio.charset.Charset +import scala.collection.mutable.ListBuffer + +class BertTokenizer(val vocab: Map[String, Int], val specialTokens: SpecialTokens) + extends BasicTokenizer { + + /** Encode the input sequence to indexes IDs adding padding where necessary */ + def encode( + sentences: Seq[(WordpieceTokenizedSentence, Int)], + maxSequenceLength: Int): Seq[Array[Int]] = { + val maxSentenceLength = + Array( + maxSequenceLength - 2, + sentences.map { case (wpTokSentence, _) => + wpTokSentence.tokens.length + }.max).min + + sentences + .map { case (wpTokSentence, _) => + val tokenPieceIds = wpTokSentence.tokens.map(t => t.pieceId) + val padding = Array.fill(maxSentenceLength - tokenPieceIds.length)(specialTokens.pad.id) + + Array(specialTokens.sentenceStart.id) ++ tokenPieceIds.take(maxSentenceLength) ++ Array( + specialTokens.sentenceEnd.id) ++ padding + } + } + + def decodeTokens(tokens: Array[Int]): String = { + val specialTokens = SpecialTokens.getSpecialTokensForModel("bert", vocab) + val decoderVocab: Map[Int, String] = vocab.map(x => (x._2, x._1)) + val unicodeToByteMapping: Map[String, Int] = + bytesToUnicodeMapping.map(x => (x._2, x._1)) + val text = tokens + .map(token => decoderVocab.getOrElse(token, "")) + .filter(x => !specialTokens.contains(x)) + .mkString("") + val bytes = text.map(x => unicodeToByteMapping(x.toString)).map(x => x.toByte).toArray + new String(bytes, Charset.forName("UTF-8")) + } + + /** Mapping for bytes to a different set of unicode characters (especially white spaces). 
This + * improved model performance for gpt-2 + */ + protected val bytesToUnicodeMapping: Map[Int, String] = { + val bytes: ListBuffer[Int] = + ListBuffer.range('!', '~' + 1) ++ ListBuffer.range('ยก', 'ยฌ' + 1) ++ ListBuffer + .range('ยฎ', 'รฟ' + 1) + val characters: ListBuffer[Int] = bytes.clone + var n = 0 + for (b <- 0 to 256) { + if (!bytes.contains(b)) { + bytes += b + characters += (256 + n) + n += 1 + } + } + (bytes zip characters.map(_.toChar.toString)).toMap + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala index eb2769a4ad7458..4afb1d5b9bf18c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala @@ -170,6 +170,14 @@ private[johnsnowlabs] object SpecialTokens { unkTokenString = "<|endoftext|>", maskTokenString = "<|endoftext|>", padTokenString = "<|endoftext|>") + case "bert" => + SpecialTokens( + vocab, + startTokenString = "[CLS]", + endTokenString = "[SEP]", + unkTokenString = "[UNK]", + maskTokenString = "[MASK]", + padTokenString = "[PAD]") } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala index 59747ec2c14f21..53ab187d6eca16 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala @@ -119,7 +119,7 @@ case class PretrainedPipeline( } def fullAnnotateImage(pathToImages: Array[String]): Array[Map[String, Seq[IAnnotation]]] = { - lightModel.fullAnnotateImage(pathToImages) + lightModel.fullAnnotateImages(pathToImages) } def fullAnnotate(audio: Array[Float]): Map[String, Seq[IAnnotation]] = { @@ -157,9 +157,14 @@ case class PretrainedPipeline( lightModel.fullAnnotateImageJava(pathToImage) } - def fullAnnotateImageJava(pathToImages: java.util.ArrayList[String]) + def fullAnnotateImageJava( + pathToImages: java.util.ArrayList[String], + texts: java.util.ArrayList[String]) : java.util.List[java.util.Map[String, java.util.List[IAnnotation]]] = { - lightModel.fullAnnotateJava(pathToImages) + if (texts.isEmpty) { + lightModel.fullAnnotateJava(pathToImages) + } else lightModel.fullAnnotateImageJava(pathToImages, texts) + } def fullAnnotateSingleAudioJava( diff --git a/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala b/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala index d1991a8c5db95a..423cb03f8929ed 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala @@ -105,9 +105,10 @@ object AssertAnnotations { val mode = columnName + ".mode" val result = columnName + ".result" val metadata = columnName + ".metadata" + val text = columnName + ".text" dataSet - .select(annotatorType, origin, height, width, nChannels, mode, result, metadata) + .select(annotatorType, origin, height, width, nChannels, mode, result, metadata, text) .rdd .map { row => val annotatorTypeSeq: Seq[String] = row @@ -134,6 +135,9 @@ object AssertAnnotations { val metadataSeq: Seq[Map[String, String]] = row .getAs[Map[String, String]]("metadata") .asInstanceOf[mutable.WrappedArray[Map[String, String]]] + val textSeq: Seq[String] = row + .getAs[String]("text") + .asInstanceOf[mutable.WrappedArray[String]] originSeq.zipWithIndex.map { 
case (origin, index) => AnnotationImage( @@ -144,7 +148,8 @@ object AssertAnnotations { nChannelsSeq(index), modeSeq(index), resultSeq(index).asInstanceOf[Array[Byte]], - metadataSeq(index)) + metadataSeq(index), + textSeq(index)) } } .collect() diff --git a/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala b/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala index d9baaf6fa38a82..d48686bafe9c4f 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala @@ -21,6 +21,7 @@ import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.{FastTest, SlowTest} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.lit import org.scalatest.flatspec.AnyFlatSpec class ImageAssemblerTest extends AnyFlatSpec { @@ -42,9 +43,32 @@ class ImageAssemblerTest extends AnyFlatSpec { val assembled = imageAssembler.transform(dataFrame) val result = AssertAnnotations.getActualImageResult(assembled, "image_assembler") - assert(result.nonEmpty) + result.foreach(annotationImages => + annotationImages.foreach { annotationImage => + assert(annotationImage.annotatorType == IMAGE) + assert(annotationImage.origin.contains(imagesPath)) + assert(annotationImage.height >= 0) + assert(annotationImage.width >= 0) + assert(annotationImage.nChannels >= 0) + assert(annotationImage.mode >= 0) + assert(annotationImage.result.nonEmpty) + assert(annotationImage.metadata.nonEmpty) + assert(annotationImage.text.isEmpty) + }) + } + + it should "work with text column" in { + + val testDF: DataFrame = dataFrame.withColumn("text", lit("What's this picture about?")) + val imageAssembler: ImageAssembler = new ImageAssembler() + .setInputCol("image") + .setOutputCol("image_assembler") + + val assembled = imageAssembler.transform(testDF) + val result = AssertAnnotations.getActualImageResult(assembled, "image_assembler") + assert(result.nonEmpty) result.foreach(annotationImages => annotationImages.foreach { annotationImage => assert(annotationImage.annotatorType == IMAGE) @@ -55,6 +79,7 @@ class ImageAssemblerTest extends AnyFlatSpec { assert(annotationImage.mode >= 0) assert(annotationImage.result.nonEmpty) assert(annotationImage.metadata.nonEmpty) + assert(annotationImage.text.nonEmpty) }) } @@ -82,7 +107,7 @@ class ImageAssemblerTest extends AnyFlatSpec { val pipeline: Pipeline = new Pipeline().setStages(Array(imageAssembler)) val pipelineModel = pipeline.fit(emptyDF) val lightPipeline = new LightPipeline(pipelineModel) - val result = lightPipeline.fullAnnotateImage(images) + val result = lightPipeline.fullAnnotateImages(images) assert(result.length == images.length) result.foreach(annotation => assert(annotation("image_assembler").nonEmpty)) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala new file mode 100644 index 00000000000000..3b068b6e47a5c9 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala @@ -0,0 +1,186 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.cv + +import com.johnsnowlabs.nlp.base.LightPipeline +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.nlp.{Annotation, AssertAnnotations, ImageAssembler} +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.lit +import org.scalatest.flatspec.AnyFlatSpec + +class BLIPForQuestionAnsweringTest extends AnyFlatSpec { + + private val modelsPath = "/models/transformers" + val tfModelPath = s"$modelsPath/tf/blip-vqa-tf/Salesforce/blip-vqa-base/saved_model/1" + val sparkNLPModelPath = s"$modelsPath/spark-nlp/tf/blip-vqa" + + val model = getBLIPForQuestionAnsweringPipelineModel + + "BLIP" should "load and save model" ignore { + val blipForQuestionAnswering = BLIPForQuestionAnswering + .loadSavedModel(tfModelPath, ResourceHelper.spark) + .setSize(384) + + blipForQuestionAnswering.write.overwrite().save(sparkNLPModelPath) + } + + "BLIP" should "answer a question for a given image" taggedAs SlowTest in { + + val testDF = getTestDF + val result = model.transform(testDF) + + val answerAnnotation = AssertAnnotations.getActualResult(result, "answer") + + answerAnnotation.foreach { annotation => + annotation.foreach(a => assert(a.result.nonEmpty)) + } + } + + it should "work with light pipeline annotate" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagePath = "src/test/resources/image/egyptian_cat.jpeg" + val resultAnnotate = lightPipeline.annotate(imagePath, "What's this picture about?") + println(s"resultAnnotate: $resultAnnotate") + + assert(resultAnnotate("answer").head.contains("cat")) + } + + it should "work with light pipeline full annotate" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagePath = "src/test/resources/image/bluetick.jpg" + val resultFullAnnotate = + lightPipeline.fullAnnotateImage(imagePath, "What's this picture about?") + + val answerAnnotation = resultFullAnnotate("answer").head.asInstanceOf[Annotation] + + println(s"imageName.result: ${answerAnnotation.result}") + assert(answerAnnotation.result.nonEmpty) + } + + it should "fullAnnotate with empty Map when a text is empty" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagesPath = Array( + "src/test/resources/image/bluetick.jpg", + "src/test/resources/image/chihuahua.jpg", + "src/test/resources/image/egyptian_cat.jpeg") + val question = "What's this picture about?" 
+ val questions = Array(question, "", question) + + val resultFullAnnotate = lightPipeline.fullAnnotateImages(imagesPath, questions) + + resultFullAnnotate.zip(imagesPath).foreach { case (annotateMap, imagePath) => + imagePath match { + case "src/test/resources/image/chihuahua.jpg" => + // For the chihuahua image, the annotateMap should be empty because the question is empty + assert( + annotateMap.isEmpty, + s"Expected empty map for image: $imagePath, but got: $annotateMap") + + case _ => + assert(annotateMap.nonEmpty, s"Expected non-empty map for image: $imagePath") + + annotateMap.get("answer") match { + case Some(annotations) => + annotations.foreach { iAnnotation => + val annotation = iAnnotation.asInstanceOf[Annotation] + assert( + annotation.result.nonEmpty, + s"Expected non-empty result for image: $imagePath, but got empty result") + } + case None => + fail(s"'answer' key not found in annotateMap for image: $imagePath") + } + } + } + } + + it should "annotate with empty Map when a text is empty" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagesPath = Array( + "src/test/resources/image/bluetick.jpg", + "src/test/resources/image/chihuahua.jpg", + "src/test/resources/image/egyptian_cat.jpeg") + val question = "What's this picture about?" + val questions = Array(question, "", question) + + val resultAnnotate = lightPipeline.annotate(imagesPath, questions) + + resultAnnotate.foreach { annotate => + println(s"annotate: $annotate") + } + + resultAnnotate.zip(imagesPath).foreach { case (annotateMap, imagePath) => + imagePath match { + case "src/test/resources/image/chihuahua.jpg" => + // For the chihuahua image, the annotateMap should be empty because the question is empty + assert( + annotateMap.isEmpty, + s"Expected empty map for image: $imagePath, but got: $annotateMap") + + case _ => + assert(annotateMap.nonEmpty, s"Expected non-empty map for image: $imagePath") + + annotateMap.get("answer") match { + case Some(annotations) => + annotations.foreach { annotation => + assert( + annotation.nonEmpty, + s"Expected non-empty result for image: $imagePath, but got empty result") + } + case None => + fail(s"'answer' key not found in annotateMap for image: $imagePath") + } + } + } + + } + + private def getBLIPForQuestionAnsweringPipelineModel = { + val testDF = getTestDF + + val imageAssembler: ImageAssembler = new ImageAssembler() + .setInputCol("image") + .setOutputCol("image_assembler") + + val loadModel = BLIPForQuestionAnswering + .pretrained() + .setInputCols("image_assembler") + .setOutputCol("answer") + .setSize(384) + + val newPipeline: Pipeline = + new Pipeline().setStages(Array(imageAssembler, loadModel)) + + newPipeline.fit(testDF) + } + + private def getTestDF: DataFrame = { + val imageFolder = "src/test/resources/image/" + val imageDF: DataFrame = ResourceHelper.spark.read + .format("image") + .option("dropInvalid", value = true) + .load(imageFolder) + + val testDF: DataFrame = imageDF.withColumn("text", lit("What's this picture about?")) + + testDF + } + +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala index 85b43a790634ab..92491fc1abddac 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala @@ -74,7 +74,7 @@ class 
CLIPForZeroShotClassificationTestSpec extends AnyFlatSpec { val pipelineModel = pipeline.fit(imageDF) val lightPipeline = new LightPipeline(pipelineModel) val images = expected.keys.map(imageFolder + _).toArray - val result = lightPipeline.fullAnnotateImage(images) + val result = lightPipeline.fullAnnotateImages(images) result.foreach { row: Map[String, Seq[IAnnotation]] => val imageName = diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala index fdf2e43b574a81..0eacd5378bde6f 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala @@ -159,7 +159,7 @@ trait ViTForImageClassificationBehaviors { this: AnyFlatSpec => val images = Array("src/test/resources/image/hen.JPEG", "src/test/resources/image/missing_file.mf") - val predictions = lightPipeline.fullAnnotateImage(images) + val predictions = lightPipeline.fullAnnotateImages(images) assert(predictions(0)("image_assembler").nonEmpty) assert(predictions(0)("class").nonEmpty) @@ -185,7 +185,7 @@ trait ViTForImageClassificationBehaviors { this: AnyFlatSpec => val images = Array("src/test/resources/image/hen.JPEG", "this is a text") - val predictions = lightPipeline.fullAnnotateImage(images) + val predictions = lightPipeline.fullAnnotateImages(images) assert(predictions(0)("image_assembler").nonEmpty) assert(predictions(0)("class").nonEmpty) @@ -232,7 +232,7 @@ class ViTImageClassificationTestSpec extends AnyFlatSpec with ViTForImageClassif "tractor.JPEG" -> "tractor", "ox.JPEG" -> "ox") - private lazy val model: ViTForImageClassification = ViTForImageClassification.pretrained() + private val model: ViTForImageClassification = ViTForImageClassification.pretrained() it should behave like behaviorsViTForImageClassification[ViTForImageClassification]( diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala index 64aae2c9d330b9..b67e2684ea432a 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala @@ -88,7 +88,7 @@ class VisionEncoderDecoderForImageCaptioningTestSpec extends AnyFlatSpec { val pipelineModel = pipeline.fit(imageDF) val lightPipeline = new LightPipeline(pipelineModel) val image = imageFolder + "egyptian_cat.jpeg" - val results = lightPipeline.fullAnnotateImage(Array(image, image)) + val results = lightPipeline.fullAnnotateImages(Array(image, image)) results.foreach { result => assert(result("image_assembler").nonEmpty) From af0c319331829bb7e982dd077b3670b1a229509c Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Wed, 2 Oct 2024 18:09:26 -0500 Subject: [PATCH 02/24] [SPARKNLP-1068] Adding BLIPForQuestionAnswering import notebook example --- ...n_Spark_NLP_BLIPForQuestionAnswering.ipynb | 3425 +++++++++++++++++ 1 file changed, 3425 insertions(+) create mode 100644 examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb diff --git a/examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb new file 
mode 100644
index 00000000000000..c1e15d7d45bf1f
--- /dev/null
+++ b/examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb
@@ -0,0 +1,3425 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "UiBTGTRfSCQh"
+      },
+      "source": [
+        "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
+        "\n",
+        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb)\n",
+        "\n",
+        "# Import BLIP models from HuggingFace 🤗 into Spark NLP 🚀\n",
+        "\n",
+        "Let's keep in mind a few things before we start 😊\n",
+        "\n",
+        "- This feature is only available in `Spark NLP 5.5.1` and later, so please make sure you have upgraded to the latest Spark NLP release.\n",
+        "- You can import BLIP models trained/fine-tuned for question answering via `TFBlipForQuestionAnswering` and export them as TensorFlow SavedModels.\n",
+        "- Reference: [TFBlipForQuestionAnswering](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.TFBlipForQuestionAnswering)\n",
+        "- Some [example models](https://huggingface.co/models?pipeline_tag=visual-question-answering&sort=trending&search=BLIP)\n",
+        "- To execute this notebook on Google Colab you will need an A100 or similar instance."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vkGbcTagUK4P"
+      },
+      "source": [
+        "## Export and Save HuggingFace model\n",
+        "\n",
+        "- We lock TensorFlow to version `2.11.0` and Transformers to `4.39.3`. This doesn't mean it won't work with future releases, but these are the versions that have been tested successfully."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "N9RXtKzHaEvi",
+        "outputId": "5631c0ca-0f5f-4f38-c9ab-9a5591906067"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m40.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m46.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m77.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m22.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m86.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m41.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires protobuf<5,>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-aiplatform 1.67.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-bigquery-storage 2.26.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-pubsub 2.23.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "googleapis-common-protos 1.65.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "pandas-gbq 0.23.1 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.6 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.19.6 which is incompatible.\n", + "tf-keras 2.17.0 requires tensorflow<2.18,>=2.17, but you have tensorflow 2.11.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q tensorflow==2.11.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fIGek4zAUVM9" + }, + "source": [ + "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", + "- We'll use [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) model from HuggingFace as an example\n", + "- In addition to `TFBlipForQuestionAnswering` we also need to save the `BlipProcessor`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "n1tqMsNXK5lN" + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + "from transformers import BlipProcessor, TFBlipForQuestionAnswering\n", + "import tensorflow as tf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "PiEKBy42ezX7" + }, + "outputs": [], + "source": [ + "MODEL_NAME = \"Salesforce/blip-vqa-base\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 353, + "referenced_widgets": [ + "a8fc97ee9a5646268761e3362eb07ccd", + "0bf25fe03bcb4c9f9c0c2556d7a1ea99", + "58cac0f27ae347debd32014c34b37a1e", + "4e7a8a4a4bef4012bb7c8d3f31056ac2", + "bfbe18f452db43bea36212209eceac60", + "427370f1a81246fd85323abba58483ac", + "158c854e5e744216b485e8e0eaf33d14", + "d07cf17e58214062be88f5da1c55221b", + "2ea6b3a04c274905b5cdb76a4d1d197a", + "b03cae4fb10a47b5ac4b69cdaaa913d0", + "55e8c34dfbbb48f6b00a16762f107787", + "800ef838b66343659fffc789449c0a9f", + "22215a25c1f04cf3bc994b91716ecd91", + "a572bc9c98bb49598735bd4af9cef841", + "9c4125362fc44efea531faf2d48e6e04", + "a93f052249df447481ecf3531e52dcb2", + "ebf1f217cdef4024a9aecd90c2471986", + "98adb63f15664ac88046d941690cf13c", + "a2d6850c56e04bc08633717c569a6393", + "749cdc9d728e4ff18ec8192eb0062789", + "569e4bb367274c37bab0a314cd998e23", + "228cdee565d545f9a35b7bcbeafd29e7", + "cb4387e38cfb462ab8d53466ad9c69c8", + "26f1c75dbc8d4faab3c5874c1fbc9802", + "04e16cc0b237449299e3858c9db4295f", + "39a19e2bca9c4c1cb057cb225e90f0cf", + "9dfb9fa922954e2fac9867039e35a8bd", + "98f5799ac2314802a4d5565c05b93597", + "6331f40bb5394cb9b0ca9c5dfb104d6c", + "76f07bae7301446280b973486572e9fa", + "252ed515f22a48e2b97857e453945fb5", + "9717a812f3f84fc9ae100f9915f680df", + "22b606b09395484aaea3946d02319eca", + "2264d7fdc4a14032b4704c0caa64d8fb", + "b8c1b72a53ca4b14b7ff874942819011", + "c1048df076c946db8909c7091b82fcfa", + "6ee8baa1c4624a74835f0a434da22ce6", + "c375f592a3ab4dbbb2ff2dd98817dc1c", + "b71dcd5229a9409b83a45c561cd57489", + "9a0d0ec79a8142c3b5113bce264adeb9", + "3c2c91312ae146f8b1e95d3e81ad0056", + "ad23ef6e0c64424bb28127a9bf6b4951", + "7a99d35b201b45ceb9f18bb21bbf5cee", + "dfbd503e8f31449fa7c2358001fc77cb", + "151a916c65ee4196ae7cb53406365c45", + "33e4be1c2ce040baae33e3f100dad4f6", + "f71322f009844d02830f45b40632dc6a", + "58baacaa12b840ef9fb48bdd797ed498", + "ff0bd78c11b34f92a861029aeb3c9d3a", + "4f71c03378fc4ede80dd4c07b319df8d", + "4e345925052f464fb4aaaa92a1bd4fc7", + "e167c4bf6725441d89edcd705ba032be", + "eca99f2c5400456d92948305189d66a6", + "aebced9d65414171a2b8bc0602be1993", + "9c4c3703c5ed48c9a753797ee56b00fc" + ] + }, + "id": "NgLAnDuhexzT", + "outputId": "0612907f-81f6-4526-e16a-25822771db73" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able 
to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a8fc97ee9a5646268761e3362eb07ccd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "preprocessor_config.json: 0%| | 0.00/445 [00:00> and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: 'NoneType' object has no attribute '_fields'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: 'NoneType' object has no attribute '_fields'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.\n", + "WARNING:tensorflow:AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:371: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. recommend setting `max_new_tokens` to control the maximum length of the generation.\n", + " return py_builtins.overload_of(f)(*args)\n", + "WARNING:absl:Found untraced functions such as serving, serving, serving, serving, patch_embedding_layer_call_fn while saving (showing 5 of 1569). 
These functions will not be directly callable after loading.\n" + ] + } + ], + "source": [ + "# Define TF Signature\n", + "@tf.function(\n", + " input_signature=[\n", + " {\n", + " \"pixel_values\": tf.TensorSpec((1, None, None, None), tf.float32, name=\"pixel_values\"),\n", + " \"input_ids\": tf.TensorSpec((1, None), tf.int32, name=\"input_ids\"),\n", + " \"attention_mask\": tf.TensorSpec((1, None), tf.int64, name=\"attention_mask\")\n", + " }\n", + " ]\n", + ")\n", + "def serving_fn(inputs):\n", + " # Unpack the input dictionary and pass it to the model's generate function\n", + " return model.generate(\n", + " input_ids=inputs[\"input_ids\"],\n", + " pixel_values=inputs[\"pixel_values\"],\n", + " attention_mask=inputs.get(\"attention_mask\", None)\n", + " )\n", + "\n", + "model.save_pretrained(\"./{}\".format(MODEL_NAME), saved_model=True, signatures={\"serving_default\": serving_fn.get_concrete_function()})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FYF-xt3HWEr0" + }, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oTlKokmrsVDR", + "outputId": "b56b637b-76a8-4471-f908-908dc44bd117" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 936\n", + "-rw-r--r-- 1 root root 471 Oct 2 18:10 preprocessor_config.json\n", + "-rw-r--r-- 1 root root 695 Oct 2 18:10 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1348 Oct 2 18:10 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 711396 Oct 2 18:10 tokenizer.json\n", + "-rw-r--r-- 1 root root 231508 Oct 2 18:10 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {MODEL_NAME}_blip_processor" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hVzKx5bUWGny", + "outputId": "b4d9ae80-f865-4e1e-825c-a02a68ce9958" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1503636\n", + "-rw-r--r-- 1 root root 664 Oct 2 18:18 config.json\n", + "-rw-r--r-- 1 root root 136 Oct 2 18:18 generation_config.json\n", + "drwxr-xr-x 3 root root 4096 Oct 2 18:14 saved_model\n", + "-rw-r--r-- 1 root root 1539703504 Oct 2 18:18 tf_model.h5\n" + ] + } + ], + "source": [ + "!ls -l {MODEL_NAME}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JcEP4XF9WXYb", + "outputId": "2952576f-b7a6-411f-9487-605be09b654c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 61764\n", + "drwxr-xr-x 2 root root 4096 Oct 2 18:14 assets\n", + "-rw-r--r-- 1 root root 55 Oct 2 18:18 fingerprint.pb\n", + "-rw-r--r-- 1 root root 604021 Oct 2 18:18 keras_metadata.pb\n", + "-rw-r--r-- 1 root root 62626669 Oct 2 18:18 saved_model.pb\n", + "drwxr-xr-x 2 root root 4096 Oct 2 18:17 variables\n" + ] + } + ], + "source": [ + "!ls -l {MODEL_NAME}/saved_model/1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WQ0yckQRsYCx" + }, + "source": [ + "So we need to move the files `preprocessor_config.json`, `tokenizer.json` and `vocab.txt` from processor to assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HWaeOrl6UDOI" + }, + "source": [ + "- As you can see, we need the SavedModel from `saved_model/1/` path\n", + "- We also be needing 
`preprocessor_config.json`, `tokenizer.json` and `vocab.txt` from the processor\n",
+        "- All we need to do is copy those files into `saved_model/1/assets`, which is where Spark NLP will look for them"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {
+        "id": "xiuyWqlLs4OL"
+      },
+      "outputs": [],
+      "source": [
+        "!mv {MODEL_NAME}_blip_processor/preprocessor_config.json {MODEL_NAME}/saved_model/1/assets\n",
+        "!mv {MODEL_NAME}_blip_processor/tokenizer.json {MODEL_NAME}/saved_model/1/assets\n",
+        "!mv {MODEL_NAME}_blip_processor/vocab.txt {MODEL_NAME}/saved_model/1/assets"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "wa1yVpATVrZv"
+      },
+      "source": [
+        "Voila! We now have `preprocessor_config.json`, `tokenizer.json` and `vocab.txt` inside the assets directory"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ljkBpPTftE8G",
+        "outputId": "e5922df7-f2be-409e-e395-83e2974a5750"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "total 928\n",
+            "-rw-r--r-- 1 root root 471 Oct 2 18:10 preprocessor_config.json\n",
+            "-rw-r--r-- 1 root root 711396 Oct 2 18:10 tokenizer.json\n",
+            "-rw-r--r-- 1 root root 231508 Oct 2 18:10 vocab.txt\n"
+          ]
+        }
+      ],
+      "source": [
+        "!ls -l {MODEL_NAME}/saved_model/1/assets"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "7NdEMMiXTQbn"
+      },
+      "source": [
+        "## Import and Save BLIPForQuestionAnswering in Spark NLP"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "YumDH6zHV1af"
+      },
+      "source": [
+        "Let's install and set up Spark NLP in Google Colab.\n",
+        "This part is pretty easy via our simple script"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {
+        "id": "Qb994CB80vU-"
+      },
+      "outputs": [],
+      "source": [
+        "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 18,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "klO_mqUs1WgE",
+        "outputId": "ff8b25e6-ea0c-4d59-fded-db93e3213d97"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "/usr/lib/python3.10/subprocess.py:1796: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n",
+            " self.pid = _posixsubprocess.fork_exec(\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Apache Spark version: 3.4.0\n"
+          ]
+        }
+      ],
+      "source": [
+        "import sparknlp\n",
+        "# let's start Spark with Spark NLP\n",
+        "spark = sparknlp.start()\n",
+        "\n",
+        "print(\"Apache Spark version: {}\".format(spark.version))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Yj1LrqgXSp22"
+      },
+      "source": [
+        "- Let's use the `loadSavedModel` function in `BLIPForQuestionAnswering`, which allows us to load the TensorFlow model in SavedModel format\n",
+        "- `loadSavedModel` accepts two params: the first is the path to the TF SavedModel, and the second is the SparkSession, i.e. the `spark` variable we previously started via `sparknlp.start()`\n",
+        "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind that the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file system natively."
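+        , "\n- For example (just a sketch, using a hypothetical HDFS path): `BLIPForQuestionAnswering.loadSavedModel(\"hdfs:///models/blip-vqa-base/saved_model/1\", spark)` is exactly the same call as the local-path version used in the next cell."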
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "s0IKr6l21dmt" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "blip_for_question_answering = BLIPForQuestionAnswering.loadSavedModel(\n", + " '{}/saved_model/1'.format(MODEL_NAME),\n", + " spark\n", + " )\\\n", + " .setSize(384)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S2SXFXqqV7io" + }, + "source": [ + "Let's save it on disk so it is easier to be moved around and also be used later via .load function" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "O_WLb5WTV-sI" + }, + "outputs": [], + "source": [ + "blip_for_question_answering.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8c-9B3fXWDqi" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "qNTTflXjWELp" + }, + "outputs": [], + "source": [ + "!rm -rf {MODEL_NAME}_blip_processor {MODEL_NAME}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bMNZ2gdcWPJI" + }, + "source": [ + "Awesome ๐Ÿ˜Ž !\n", + "\n", + "This is your BLIPForQuestionAnswering model from HuggingFace ๐Ÿค— loaded and saved by Spark NLP ๐Ÿš€" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JPoiZrbg-agf", + "outputId": "e8be56dd-f998-499c-f8e5-b738ce81a989" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1563412\n", + "-rw-r--r-- 1 root root 1600921187 Oct 2 18:42 blip_vqa_tensorflow\n", + "drwxr-xr-x 4 root root 4096 Oct 2 18:41 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 2 18:41 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Oizr-BZYWVmj" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BLIPForQuestionAnswering model in Spark NLP ๐Ÿš€ pipeline!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kfXocFvjWbOq" + }, + "source": [ + "Let's try with a public image of cats" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qNGGZSbxAkSp", + "outputId": "70c64f2f-3347-460e-8df2-d02fb036ff32" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-10-02 18:42:30-- http://images.cocodataset.org/val2017/000000039769.jpg\n", + "Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.27.152, 3.5.29.161, 16.182.34.49, ...\n", + "Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.27.152|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 173131 (169K) [image/jpeg]\n", + "Saving to: โ€˜/content/cat_image.jpgโ€™\n", + "\n", + "/content/cat_image. 
100%[===================>] 169.07K 312KB/s in 0.5s \n", + "\n", + "2024-10-02 18:42:31 (312 KB/s) - โ€˜/content/cat_image.jpgโ€™ saved [173131/173131]\n", + "\n" + ] + } + ], + "source": [ + "!wget -O /content/cat_image.jpg \"http://images.cocodataset.org/val2017/000000039769.jpg\"" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "MDeYB-PGAvgA" + }, + "outputs": [], + "source": [ + "!mkdir images\n", + "!mv cat_image.jpg images" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l6Ii_rwDWn3J" + }, + "source": [ + "To proceed, please create a DataFrame with two columns:\n", + "\n", + "- An `image` column that contains the file path for each image in the directory.\n", + "- A `text` column where you can input the specific question you would like to ask about each image." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GlJRrn7NA5_3", + "outputId": "13703fbb-0085-49dd-9909-212bc45624f1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| image| text|\n", + "+--------------------+--------------------+\n", + "|{file:///content/...|What's this pictu...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit\n", + "\n", + "images_path = \"./images/\"\n", + "image_df = spark.read.format(\"image\").load(path=images_path)\n", + "\n", + "test_df = image_df.withColumn(\"text\", lit(\"What's this picture about?\"))\n", + "test_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XO8RXVifXNbZ" + }, + "source": [ + "Now let's build our `BLIPForQuestionAnswering` pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "00MxfP2KBKpW" + }, + "outputs": [], + "source": [ + "imageAssembler = ImageAssembler() \\\n", + " .setInputCol(\"image\") \\\n", + " .setOutputCol(\"image_assembler\") \\\n", + "\n", + "imageClassifier = BLIPForQuestionAnswering.load(\"./{}_spark_nlp\".format(MODEL_NAME)) \\\n", + " .setInputCols(\"image_assembler\") \\\n", + " .setOutputCol(\"answer\") \\\n", + " .setSize(384)\n", + "\n", + "pipeline = Pipeline(\n", + " stages=[\n", + " imageAssembler,\n", + " imageClassifier,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "id": "m3z6twXbBhw4" + }, + "outputs": [], + "source": [ + "model = pipeline.fit(test_df)\n", + "result = model.transform(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_8NQhgilCGDO", + "outputId": "ed295952-9553-4780-f3fd-9a6adea89fe7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------------------------+------+\n", + "|origin |result|\n", + "+--------------------------------------+------+\n", + "|[file:///content/images/cat_image.jpg]|[cats]|\n", + "+--------------------------------------+------+\n", + "\n" + ] + } + ], + "source": [ + "result.select(\"image_assembler.origin\", \"answer.result\").show(truncate = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YDvCiVP3XXPd" + }, + "source": [ + "That's it! 
You can now go wild and use hundreds of `BLIPForQuestionAnswering` models from HuggingFace ๐Ÿค— in Spark NLP ๐Ÿš€\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "04e16cc0b237449299e3858c9db4295f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_76f07bae7301446280b973486572e9fa", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_252ed515f22a48e2b97857e453945fb5", + "value": 231508 + } + }, + "0b1ed81f489c4fd09ab7bb1d1ad938fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0bf25fe03bcb4c9f9c0c2556d7a1ea99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_427370f1a81246fd85323abba58483ac", + "placeholder": "โ€‹", + "style": "IPY_MODEL_158c854e5e744216b485e8e0eaf33d14", + "value": "preprocessor_config.json:โ€‡100%" + } + }, + "0e3e739b6a5c4e4aaec788974ef551b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c4edf1f672042e68e6a15e7da5a0127", + "placeholder": "โ€‹", + "style": "IPY_MODEL_21951a3e1c6a4650851d4ee31cd2387f", + "value": "โ€‡1.54G/1.54Gโ€‡[00:51<00:00,โ€‡29.4MB/s]" + } + }, + "111f56022b3c4737a9f643143673c6b5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "151a916c65ee4196ae7cb53406365c45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_33e4be1c2ce040baae33e3f100dad4f6", + "IPY_MODEL_f71322f009844d02830f45b40632dc6a", + "IPY_MODEL_58baacaa12b840ef9fb48bdd797ed498" + ], + "layout": "IPY_MODEL_ff0bd78c11b34f92a861029aeb3c9d3a" + } + }, + "158c854e5e744216b485e8e0eaf33d14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "16ddbd3fcb7f4dba8e8b48d6f6962046": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_111f56022b3c4737a9f643143673c6b5", + "placeholder": "โ€‹", + "style": "IPY_MODEL_af46ebc1d3d84a8589920ee7338936cf", + "value": "config.json:โ€‡100%" + } + }, + "18317efb0631479bbbd6f373942c7349": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "21951a3e1c6a4650851d4ee31cd2387f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "22215a25c1f04cf3bc994b91716ecd91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ebf1f217cdef4024a9aecd90c2471986", + "placeholder": "โ€‹", + "style": "IPY_MODEL_98adb63f15664ac88046d941690cf13c", + "value": "tokenizer_config.json:โ€‡100%" + } + }, + "2264d7fdc4a14032b4704c0caa64d8fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b8c1b72a53ca4b14b7ff874942819011", + "IPY_MODEL_c1048df076c946db8909c7091b82fcfa", + "IPY_MODEL_6ee8baa1c4624a74835f0a434da22ce6" + ], + "layout": "IPY_MODEL_c375f592a3ab4dbbb2ff2dd98817dc1c" + } + }, + "228cdee565d545f9a35b7bcbeafd29e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "22b606b09395484aaea3946d02319eca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "252ed515f22a48e2b97857e453945fb5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "26f1c75dbc8d4faab3c5874c1fbc9802": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_98f5799ac2314802a4d5565c05b93597", + "placeholder": "โ€‹", + "style": 
"IPY_MODEL_6331f40bb5394cb9b0ca9c5dfb104d6c", + "value": "vocab.txt:โ€‡100%" + } + }, + "2ea6b3a04c274905b5cdb76a4d1d197a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "33e4be1c2ce040baae33e3f100dad4f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f71c03378fc4ede80dd4c07b319df8d", + "placeholder": "โ€‹", + "style": "IPY_MODEL_4e345925052f464fb4aaaa92a1bd4fc7", + "value": "special_tokens_map.json:โ€‡100%" + } + }, + "39202d00e08f49d196159bdd16c29f6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6fe7e0e408d54752ae71d47a58f31469", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ddddfea881df4a7b89845fb4485edf0d", + "value": "model.safetensors:โ€‡100%" + } + }, + "39a19e2bca9c4c1cb057cb225e90f0cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9717a812f3f84fc9ae100f9915f680df", + "placeholder": "โ€‹", + "style": "IPY_MODEL_22b606b09395484aaea3946d02319eca", + "value": "โ€‡232k/232kโ€‡[00:00<00:00,โ€‡668kB/s]" + } + }, + "3c2c91312ae146f8b1e95d3e81ad0056": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": 
null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "427370f1a81246fd85323abba58483ac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4e345925052f464fb4aaaa92a1bd4fc7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4e7a8a4a4bef4012bb7c8d3f31056ac2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b03cae4fb10a47b5ac4b69cdaaa913d0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_55e8c34dfbbb48f6b00a16762f107787", + "value": "โ€‡445/445โ€‡[00:00<00:00,โ€‡32.3kB/s]" + } + }, + "4f5e6c1c45794f03aed2dd7223dd3255": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_16ddbd3fcb7f4dba8e8b48d6f6962046", + "IPY_MODEL_d11879914a854d8a91a4872ef4afc942", + "IPY_MODEL_ec039adb3b1f4522a7dac4386040590a" + ], + "layout": "IPY_MODEL_f7de63cc1da94daf9dc83406301873a3" + } + }, + "4f71c03378fc4ede80dd4c07b319df8d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "559b67a1bb9240a887a34c9eafda45eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "55e8c34dfbbb48f6b00a16762f107787": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "569e4bb367274c37bab0a314cd998e23": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "58baacaa12b840ef9fb48bdd797ed498": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aebced9d65414171a2b8bc0602be1993", + "placeholder": "โ€‹", + "style": "IPY_MODEL_9c4c3703c5ed48c9a753797ee56b00fc", + "value": "โ€‡125/125โ€‡[00:00<00:00,โ€‡11.2kB/s]" + } + }, + "58cac0f27ae347debd32014c34b37a1e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d07cf17e58214062be88f5da1c55221b", + "max": 445, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2ea6b3a04c274905b5cdb76a4d1d197a", + "value": 445 + } + }, + "5ce925ad60054d518453a6c6ae8d1707": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "626dcbd9418949b0b7e5dc8680f9b19b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_704723b61c674d3d9c322f6b31c9830a", + "max": 1538800584, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_559b67a1bb9240a887a34c9eafda45eb", + "value": 1538800584 + } + }, + "6331f40bb5394cb9b0ca9c5dfb104d6c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "description_width": "" + } + }, + "6ee8baa1c4624a74835f0a434da22ce6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a99d35b201b45ceb9f18bb21bbf5cee", + "placeholder": "โ€‹", + "style": "IPY_MODEL_dfbd503e8f31449fa7c2358001fc77cb", + "value": "โ€‡711k/711kโ€‡[00:00<00:00,โ€‡1.37MB/s]" + } + }, + "6fe7e0e408d54752ae71d47a58f31469": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "704723b61c674d3d9c322f6b31c9830a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "749cdc9d728e4ff18ec8192eb0062789": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "763498ed74e6446a972930ab96d5d4d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "76f07bae7301446280b973486572e9fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7a99d35b201b45ceb9f18bb21bbf5cee": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c4edf1f672042e68e6a15e7da5a0127": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "800ef838b66343659fffc789449c0a9f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_22215a25c1f04cf3bc994b91716ecd91", + "IPY_MODEL_a572bc9c98bb49598735bd4af9cef841", + "IPY_MODEL_9c4125362fc44efea531faf2d48e6e04" + ], + "layout": "IPY_MODEL_a93f052249df447481ecf3531e52dcb2" + } + }, + "9717a812f3f84fc9ae100f9915f680df": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "98adb63f15664ac88046d941690cf13c": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "98f5799ac2314802a4d5565c05b93597": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9a0d0ec79a8142c3b5113bce264adeb9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9c4125362fc44efea531faf2d48e6e04": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_569e4bb367274c37bab0a314cd998e23", + "placeholder": "โ€‹", + "style": "IPY_MODEL_228cdee565d545f9a35b7bcbeafd29e7", + "value": "โ€‡592/592โ€‡[00:00<00:00,โ€‡53.5kB/s]" + } + }, + "9c4c3703c5ed48c9a753797ee56b00fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9dfb9fa922954e2fac9867039e35a8bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2d6850c56e04bc08633717c569a6393": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a572bc9c98bb49598735bd4af9cef841": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a2d6850c56e04bc08633717c569a6393", + "max": 592, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_749cdc9d728e4ff18ec8192eb0062789", + "value": 592 + } + }, + "a8fc97ee9a5646268761e3362eb07ccd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0bf25fe03bcb4c9f9c0c2556d7a1ea99", + "IPY_MODEL_58cac0f27ae347debd32014c34b37a1e", + 
"IPY_MODEL_4e7a8a4a4bef4012bb7c8d3f31056ac2" + ], + "layout": "IPY_MODEL_bfbe18f452db43bea36212209eceac60" + } + }, + "a9265e8b56b14330a51ac0e07faab189": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_39202d00e08f49d196159bdd16c29f6f", + "IPY_MODEL_626dcbd9418949b0b7e5dc8680f9b19b", + "IPY_MODEL_0e3e739b6a5c4e4aaec788974ef551b5" + ], + "layout": "IPY_MODEL_5ce925ad60054d518453a6c6ae8d1707" + } + }, + "a93f052249df447481ecf3531e52dcb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ad23ef6e0c64424bb28127a9bf6b4951": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "aebced9d65414171a2b8bc0602be1993": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "af46ebc1d3d84a8589920ee7338936cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b03cae4fb10a47b5ac4b69cdaaa913d0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b71dcd5229a9409b83a45c561cd57489": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8c1b72a53ca4b14b7ff874942819011": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b71dcd5229a9409b83a45c561cd57489", + "placeholder": "โ€‹", + "style": "IPY_MODEL_9a0d0ec79a8142c3b5113bce264adeb9", + "value": "tokenizer.json:โ€‡100%" + } + }, + "bfbe18f452db43bea36212209eceac60": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c1048df076c946db8909c7091b82fcfa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3c2c91312ae146f8b1e95d3e81ad0056", + "max": 711396, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ad23ef6e0c64424bb28127a9bf6b4951", + "value": 711396 + } + }, + "c375f592a3ab4dbbb2ff2dd98817dc1c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"caa25abd3df346da806da3659070ae87": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cb4387e38cfb462ab8d53466ad9c69c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_26f1c75dbc8d4faab3c5874c1fbc9802", + "IPY_MODEL_04e16cc0b237449299e3858c9db4295f", + "IPY_MODEL_39a19e2bca9c4c1cb057cb225e90f0cf" + ], + "layout": "IPY_MODEL_9dfb9fa922954e2fac9867039e35a8bd" + } + }, + "d07cf17e58214062be88f5da1c55221b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d11879914a854d8a91a4872ef4afc942": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_caa25abd3df346da806da3659070ae87", + "max": 4559, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0b1ed81f489c4fd09ab7bb1d1ad938fb", + "value": 4559 + } + }, + "ddddfea881df4a7b89845fb4485edf0d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dfbd503e8f31449fa7c2358001fc77cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e167c4bf6725441d89edcd705ba032be": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ebf1f217cdef4024a9aecd90c2471986": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec039adb3b1f4522a7dac4386040590a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_763498ed74e6446a972930ab96d5d4d8", + "placeholder": "โ€‹", + "style": "IPY_MODEL_18317efb0631479bbbd6f373942c7349", + "value": "โ€‡4.56k/4.56kโ€‡[00:00<00:00,โ€‡378kB/s]" + } + }, + "eca99f2c5400456d92948305189d66a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f71322f009844d02830f45b40632dc6a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e167c4bf6725441d89edcd705ba032be", + "max": 125, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eca99f2c5400456d92948305189d66a6", + "value": 125 + } + }, + "f7de63cc1da94daf9dc83406301873a3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff0bd78c11b34f92a861029aeb3c9d3a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From c256e1648af4addf0e5bb663e8f4c93ede986258 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Thu, 3 Oct 2024 12:57:23 -0500 Subject: [PATCH 03/24] [SPARKNLP-1068] Fix fullAnnotateImage validation --- src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala | 4 +--- src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala b/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala index d6793fdba19e8e..20236a5732f3fd 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala @@ -78,8 +78,7 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = } def fullAnnotateImage(pathToImage: String, text: String = ""): Map[String, Seq[IAnnotation]] = { - val isValidFile = ResourceHelper.validFile(pathToImage) - if (!isValidFile || isValidFile && text.isEmpty) { + if (!ResourceHelper.validFile(pathToImage)) { Map() } else fullAnnotateInternal(pathToImage, text) } @@ -374,7 +373,6 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = pathToImages: java.util.ArrayList[String], texts: java.util.ArrayList[String]) : java.util.List[java.util.Map[String, java.util.List[IAnnotation]]] = { - if (texts.isEmpty) { pathToImages.asScala.par .map { imageFilePath => diff --git a/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala b/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala index d48686bafe9c4f..0161fbdff4e35c 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala @@ -58,7 +58,7 @@ class ImageAssemblerTest extends AnyFlatSpec { }) } - it should "work with text column" in { + it should "work with text column" taggedAs FastTest in { val testDF: DataFrame = dataFrame.withColumn("text", lit("What's this picture about?")) val imageAssembler: ImageAssembler = new ImageAssembler() From 7c46662a6e7e78d53b41ed2d27c4a02a38243677 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Thu, 3 Oct 2024 13:48:27 -0500 Subject: [PATCH 04/24] [SPARKNLP-1068] Solves BLIPForQuestionAnsweringTest issue --- .../cv/BLIPForQuestionAnsweringTest.scala | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git 
a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala index 3b068b6e47a5c9..d511151316ce96 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala @@ -27,19 +27,7 @@ import org.scalatest.flatspec.AnyFlatSpec class BLIPForQuestionAnsweringTest extends AnyFlatSpec { - private val modelsPath = "/models/transformers" - val tfModelPath = s"$modelsPath/tf/blip-vqa-tf/Salesforce/blip-vqa-base/saved_model/1" - val sparkNLPModelPath = s"$modelsPath/spark-nlp/tf/blip-vqa" - - val model = getBLIPForQuestionAnsweringPipelineModel - - "BLIP" should "load and save model" ignore { - val blipForQuestionAnswering = BLIPForQuestionAnswering - .loadSavedModel(tfModelPath, ResourceHelper.spark) - .setSize(384) - - blipForQuestionAnswering.write.overwrite().save(sparkNLPModelPath) - } + lazy val model = getBLIPForQuestionAnsweringPipelineModel "BLIP" should "answer a question for a given image" taggedAs SlowTest in { From 1b4b29d1e0c53dfd934c9d6ecf6df6082932e163 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Fri, 4 Oct 2024 07:49:32 -0500 Subject: [PATCH 05/24] [SPARKNLP-1068] Updates default BLIPForQuestionAnswering model name --- python/sparknlp/annotator/cv/blip_for_question_answering.py | 2 +- .../nlp/annotators/cv/BLIPForQuestionAnswering.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sparknlp/annotator/cv/blip_for_question_answering.py b/python/sparknlp/annotator/cv/blip_for_question_answering.py index b861449e27d862..6153ddd61fe302 100644 --- a/python/sparknlp/annotator/cv/blip_for_question_answering.py +++ b/python/sparknlp/annotator/cv/blip_for_question_answering.py @@ -84,7 +84,7 @@ def loadSavedModel(folder, spark_session): return BLIPForQuestionAnswering(java_model=jModel) @staticmethod - def pretrained(name="blip_vqa_tf", lang="en", remote_loc=None): + def pretrained(name="blip_vqa_base", lang="en", remote_loc=None): """Downloads and loads a pretrained model. 
Parameters diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala index 9cd5bca6ff9e35..88c21943b06928 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala @@ -219,7 +219,7 @@ trait ReadablePretrainedBLIPForQuestionAnswering extends ParamsAndFeaturesReadable[BLIPForQuestionAnswering] with HasPretrained[BLIPForQuestionAnswering] { - override val defaultModelName: Some[String] = Some("blip_vqa_tf") + override val defaultModelName: Some[String] = Some("blip_vqa_base") /** Java compliant-overrides */ override def pretrained(): BLIPForQuestionAnswering = super.pretrained() From e121763255b2296234858699a95d9d417fbb4c89 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Wed, 9 Oct 2024 12:44:23 -0500 Subject: [PATCH 06/24] [SPARKNLP-1068] [skip test] Adding documentation to BLIPForQuestionAnswering --- .../cv/blip_for_question_answering.py | 67 +++++++++++++- .../cv/BLIPForQuestionAnswering.scala | 87 ++++++++++++++++++- 2 files changed, 151 insertions(+), 3 deletions(-) diff --git a/python/sparknlp/annotator/cv/blip_for_question_answering.py b/python/sparknlp/annotator/cv/blip_for_question_answering.py index 6153ddd61fe302..fe018c0e683bf2 100644 --- a/python/sparknlp/annotator/cv/blip_for_question_answering.py +++ b/python/sparknlp/annotator/cv/blip_for_question_answering.py @@ -20,6 +20,71 @@ class BLIPForQuestionAnswering(AnnotatorModel, HasEngine, HasCandidateLabelsProperties, HasRescaleFactor): + """BLIPForQuestionAnswering can load BLIP models for visual question answering. + The model consists of a vision encoder, a text encoder as well as a text decoder. + The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\ + ... .setInputCols(["image_assembler"]) \\ + ... .setOutputCol("answer") + + The default model is ``"blip_vqa_base"``, if no name is + provided. + + For available pretrained models please see the `Models Hub + `__. + + To see which models are compatible and how to import them see + `Import Transformers into Spark NLP ๐Ÿš€ + `_. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``IMAGE`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + batchSize + Batch size. Large values allows faster processing but requires more + memory, by default 2 + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + maxSentenceLength + Max sentence length to process, by default 50 + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> image_df = SparkSessionForTest.spark.read.format("image").load(path=images_path) + >>> test_df = image_df.withColumn("text", lit("What's this picture about?")) + >>> imageAssembler = ImageAssembler() \\ + ... .setInputCol("image") \\ + ... .setOutputCol("image_assembler") + >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\ + ... 
.setInputCols("image_assembler") \\ + ... .setOutputCol("answer") \\ + ... .setSize(384) + >>> pipeline = Pipeline().setStages([ + ... imageAssembler, + ... visualQAClassifier + ... ]) + >>> result = pipeline.fit(test_df).transform(test_df) + >>> result.select("image_assembler.origin", "answer.result").show(false) + +--------------------------------------+------+ + |origin |result| + +--------------------------------------+------+ + |[file:///content/images/cat_image.jpg]|[cats]| + +--------------------------------------+------+ + """ name = "BLIPForQuestionAnswering" @@ -59,7 +124,7 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestion ) self._setDefault( batchSize=2, - size=224, + size=384, maxSentenceLength=50 ) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala index 88c21943b06928..a0f15de929cafb 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala @@ -31,17 +31,100 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, IMAGE} import com.johnsnowlabs.nlp._ -import com.johnsnowlabs.nlp.annotators.{RegexTokenizer, Tokenizer, TokenizerModel} +import com.johnsnowlabs.nlp.annotators.RegexTokenizer import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BertTokenizer, SpecialTokens} -import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.param.{IntArrayParam, IntParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.SparkSession +/** BLIPForQuestionAnswering can load BLIP models for visual question answering. The model + * consists of a vision encoder, a text encoder as well as a text decoder. The vision encoder + * will encode the input image, the text encoder will encode the input question together with the + * encoding of the image, and the text decoder will output the answer to the question. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val visualQAClassifier = BLIPForQuestionAnswering.pretrained() + * .setInputCols("image_assembler") + * .setOutputCol("answer") + * }}} + * The default model is `"blip_vqa_base"`, if no name is provided. + * + * For available pretrained models please see the + * [[https://sparknlp.org/models?task=Question+Answering Models Hub]]. + * + * Models from the HuggingFace ๐Ÿค— Transformers library are also compatible with Spark NLP ๐Ÿš€. To + * see which models are compatible and how to import them see + * [[https://github.com/JohnSnowLabs/spark-nlp/discussions/5669]] and to see more extended + * examples, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala]]. 
+ * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * + * val imageDF: DataFrame = ResourceHelper.spark.read + * .format("image") + * .option("dropInvalid", value = true) + * .load(imageFolder) + * + * val testDF: DataFrame = imageDF.withColumn("text", lit("What's this picture about?")) + * + * val imageAssembler: ImageAssembler = new ImageAssembler() + * .setInputCol("image") + * .setOutputCol("image_assembler") + * + * val visualQAClassifier = BLIPForQuestionAnswering.pretrained() + * .setInputCols("image_assembler") + * .setOutputCol("answer") + * + * val pipeline = new Pipeline().setStages(Array( + * imageAssembler, + * visualQAClassifier + * )) + * + * val result = pipeline.fit(testDF).transform(testDF) + * + * result.select("image_assembler.origin", "answer.result").show(false) + * +--------------------------------------+------+ + * |origin |result| + * +--------------------------------------+------+ + * |[file:///content/images/cat_image.jpg]|[cats]| + * +--------------------------------------+------+ + * }}} + * + * @see + * [[CLIPForZeroShotClassification]] for Zero Shot Image Classifier + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based classifiers + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. 
+ */ + class BLIPForQuestionAnswering(override val uid: String) extends AnnotatorModel[BLIPForQuestionAnswering] with HasBatchedAnnotateImage[BLIPForQuestionAnswering] From c1e072e7a908cab362bcd86003b078e677c658de Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Fri, 18 Oct 2024 18:16:34 +0200 Subject: [PATCH 07/24] Add a new llama_cpp engine (#14436) --- src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala | 5 +++++ .../johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala | 5 ++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala b/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala index 02ecbc1d626082..e75a3ce29c61a9 100644 --- a/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala +++ b/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala @@ -33,6 +33,7 @@ final case object ONNX extends ModelEngine { val decoderModel = "decoder_model.onnx" val decoderWithPastModel = "decoder_with_past_model.onnx" } + final case object Openvino extends ModelEngine { val name = "openvino" val ovModel = "openvino_model" @@ -41,6 +42,10 @@ final case object Openvino extends ModelEngine { val decoderModelWithPast = "openvino_decoder_with_past_model" } +final case object LlamaCPP extends ModelEngine { + val name = "llama_cpp" +} + final case object Unknown extends ModelEngine { val name = "unk" } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index e681ce99888010..8049ba6b642473 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -16,9 +16,9 @@ package com.johnsnowlabs.nlp.annotators.seq2seq import com.johnsnowlabs.ml.gguf.GGUFWrapper +import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.util.io.ResourceHelper -import com.johnsnowlabs.nlp.llama.LlamaModel import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.SparkSession @@ -153,6 +153,8 @@ class AutoGGUFModel(override val uid: String) this } + private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) + override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) getModelIfNotSet.saveToFile(path) @@ -261,6 +263,7 @@ trait ReadAutoGGUFModel { val annotatorModel = new AutoGGUFModel() annotatorModel .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) + .setEngine(LlamaCPP.name) val metadata = LlamaModel.getMetadataFromFile(localPath) if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) From f763fac47f60cde6c015f802e4e8f22fbc8ad240 Mon Sep 17 00:00:00 2001 From: Abdullah mubeen <77073730+AbdullahMubeenAnwar@users.noreply.github.com> Date: Fri, 18 Oct 2024 09:48:34 -0700 Subject: [PATCH 08/24] tasks-docs-integration (#14428) * Update navigation.yml Adding "- Tasks" page to SparkNLP Docs * adding task files --- docs/_data/navigation.yml | 2 + docs/en/tasks/automatic_speech_recognition.md | 149 ++++++++++++ docs/en/tasks/dependency_parsing.md | 178 +++++++++++++++ docs/en/tasks/image_captioning.md | 151 +++++++++++++ docs/en/tasks/image_classification.md | 154 +++++++++++++ docs/en/tasks/landing_page.md | 171 ++++++++++++++ docs/en/tasks/question_answering.md | 156 +++++++++++++ docs/en/tasks/summarization.md | 150 +++++++++++++ 
docs/en/tasks/table_question_answering.md | 212 ++++++++++++++++++ docs/en/tasks/text_classification.md | 164 ++++++++++++++ docs/en/tasks/text_generation.md | 180 +++++++++++++++ docs/en/tasks/text_preprocessing.md | 170 ++++++++++++++ docs/en/tasks/token_classification.md | 176 +++++++++++++++ docs/en/tasks/translation.md | 135 +++++++++++ docs/en/tasks/zero_shot_classification.md | 152 +++++++++++++ .../tasks/zero_shot_image_classification.md | 186 +++++++++++++++ 16 files changed, 2486 insertions(+) create mode 100644 docs/en/tasks/automatic_speech_recognition.md create mode 100644 docs/en/tasks/dependency_parsing.md create mode 100644 docs/en/tasks/image_captioning.md create mode 100644 docs/en/tasks/image_classification.md create mode 100644 docs/en/tasks/landing_page.md create mode 100644 docs/en/tasks/question_answering.md create mode 100644 docs/en/tasks/summarization.md create mode 100644 docs/en/tasks/table_question_answering.md create mode 100644 docs/en/tasks/text_classification.md create mode 100644 docs/en/tasks/text_generation.md create mode 100644 docs/en/tasks/text_preprocessing.md create mode 100644 docs/en/tasks/token_classification.md create mode 100644 docs/en/tasks/translation.md create mode 100644 docs/en/tasks/zero_shot_classification.md create mode 100644 docs/en/tasks/zero_shot_image_classification.md diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index c6e75a2a846237..85688b6c357880 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -44,6 +44,8 @@ sparknlp: url: /docs/en/pipelines - title: General Concepts url: /docs/en/concepts + - title: Tasks + url: /docs/en/tasks/landing_page - title: Annotators url: /docs/en/annotators - title: Transformers diff --git a/docs/en/tasks/automatic_speech_recognition.md b/docs/en/tasks/automatic_speech_recognition.md new file mode 100644 index 00000000000000..ee0e96ccdc774a --- /dev/null +++ b/docs/en/tasks/automatic_speech_recognition.md @@ -0,0 +1,149 @@ +--- +layout: docs +header: true +seotitle: +title: Automatic Speech Recognition +permalink: docs/en/tasks/automatic_speech_recognition +key: docs-tasks-automatic-speech-recognition +modify_date: "2024-09-26" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Automatic Speech Recognition (ASR)** is the technology that enables computers to recognize and process human speech into text. ASR plays a vital role in numerous applications, from voice-activated assistants to transcription services, making it an essential part of modern natural language processing (NLP) solutions. Spark NLP provides powerful tools for implementing ASR systems effectively. + +In this context, ASR involves converting spoken language into text by analyzing audio signals. Common use cases include: + +- **Voice Assistants:** Enabling devices like smartphones and smart speakers to understand and respond to user commands. +- **Transcription Services:** Automatically converting audio recordings from meetings, interviews, or lectures into written text. +- **Accessibility:** Helping individuals with disabilities interact with technology through voice commands. + +By leveraging ASR, organizations can enhance user experience, improve accessibility, and streamline workflows that involve audio data. + +
+ +
+ +## Picking a Model + +When selecting a model for Automatic Speech Recognition, itโ€™s essential to evaluate several factors to ensure optimal performance for your specific use case. Begin by analyzing the **nature of your audio data**, considering the accent, language, and quality of the recordings. Determine if your task requires **real-time transcription** or if batch processing is sufficient, as some models excel in specific scenarios. + +Next, assess the **model complexity**; simpler models may suffice for straightforward tasks, while more sophisticated models are better suited for nuanced speech recognition. Consider the **availability of diverse audio data** for training, as larger datasets can significantly enhance model performance. Define key **performance metrics** (e.g., word error rate, accuracy) to guide your choice, and ensure the model's interpretability meets your requirements. Finally, account for **resource constraints**, as advanced models typically demand more memory and processing power. + +To explore and select from a variety of models, visit [Spark NLP Models](https://sparknlp.org/models), where you can find models tailored for different ASR tasks and languages. + +#### Recommended Models for Automatic Speech Recognition Tasks +- **General Speech Recognition:** Use models like [`asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman`](https://sparknlp.org/2022/09/24/asr_wav2vec2_large_xlsr_53_english_by_jonatasgrosman_en.html){:target="_blank"} for general-purpose transcription. +- **Multilingual Support:** For applications requiring support for multiple languages, consider using models like [`asr_wav2vec2_large_xlsr_53_portuguese_by_jonatasgrosman`](https://sparknlp.org/2021/12/15/wav2vec2.html){:target="_blank"} from the [`Wav2Vec2ForCTC`](https://sparknlp.org/docs/en/transformers#wav2vec2forctc){:target="_blank"} transformer. + +By thoughtfully considering these factors and using the right models, you can enhance your ASR applications significantly. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Step 1: Assemble the raw audio content into a suitable format +audioAssembler = AudioAssembler() \ + .setInputCol("audio_content") \ + .setOutputCol("audio_assembler") + +# Step 2: Load a pre-trained Wav2Vec2 model for automatic speech recognition (ASR) +speechToText = Wav2Vec2ForCTC \ + .pretrained() \ + .setInputCols(["audio_assembler"]) \ + .setOutputCol("text") + +# Step 3: Define the pipeline with audio assembler and speech-to-text model +pipeline = Pipeline().setStages([audioAssembler, speechToText]) + +# Step 4: Create a DataFrame containing the raw audio content (as floats) +processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content") + +# Step 5: Fit the pipeline and transform the audio data +result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats) + +# Step 6: Display the transcribed text from the audio +result.select("text.result").show(truncate = False) + ++------------------------------------------------------------------------------------------+ +|result | ++------------------------------------------------------------------------------------------+ +|[MISTER QUILTER IS THE APOSTLE OF THE MIDLE CLASES AND WE ARE GLAD TO WELCOME HIS GOSPEL ]| ++------------------------------------------------------------------------------------------+ +``` +```scala +import spark.implicits._ +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotators._ +import com.johnsnowlabs.nlp.annotators.audio.Wav2Vec2ForCTC +import org.apache.spark.ml.Pipeline + +// Step 1: Assemble the raw audio content into a suitable format +val audioAssembler: AudioAssembler = new AudioAssembler() + .setInputCol("audio_content") + .setOutputCol("audio_assembler") + +// Step 2: Load a pre-trained Wav2Vec2 model for automatic speech recognition (ASR) +val speechToText: Wav2Vec2ForCTC = Wav2Vec2ForCTC + .pretrained() + .setInputCols("audio_assembler") + .setOutputCol("text") + +// Step 3: Define the pipeline with audio assembler and speech-to-text model +val pipeline: Pipeline = new Pipeline().setStages(Array(audioAssembler, speechToText)) + +// Step 4: Load raw audio floats from a CSV file +val bufferedSource = + scala.io.Source.fromFile("src/test/resources/audio/csv/audio_floats.csv") + +// Step 5: Extract raw audio floats from CSV and convert to an array of floats +val rawFloats = bufferedSource + .getLines() + .map(_.split(",").head.trim.toFloat) + .toArray +bufferedSource.close + +// Step 6: Create a DataFrame with raw audio content (as floats) +val processedAudioFloats = Seq(rawFloats).toDF("audio_content") + +// Step 7: Fit the pipeline and transform the audio data +val result = pipeline.fit(processedAudioFloats).transform(processedAudioFloats) + +// Step 8: Display the transcribed text from the audio +result.select("text.result").show(truncate = false) + ++------------------------------------------------------------------------------------------+ +|result | ++------------------------------------------------------------------------------------------+ +|[MISTER QUILTER IS THE APOSTLE OF THE MIDLE CLASES AND WE ARE GLAD TO WELCOME HIS GOSPEL ]| ++------------------------------------------------------------------------------------------+ +``` +
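+
+The Python example above assumes that `rawFloats` already holds the audio samples as a list of floats, while only the Scala tab shows how to load them. Below is a minimal, hedged sketch of one way to produce `rawFloats` in Python; it is not part of the Spark NLP example itself, it assumes the third-party `librosa` library is installed, and the file path is only a placeholder.
+
+```python
+import librosa
+
+# Load the waveform resampled to 16 kHz, the rate pretrained Wav2Vec2 models typically expect
+waveform, sampling_rate = librosa.load("path/to/your_audio.wav", sr=16000)
+
+# AudioAssembler expects the samples as plain Python floats
+rawFloats = [float(sample) for sample in waveform]
+
+processedAudioFloats = spark.createDataFrame([[rawFloats]]).toDF("audio_content")
+```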
+
+## Try Real-Time Demos!
+
+If you want to see the outputs of ASR models in real time, visit our interactive demos:
+
+- **[Wav2Vec2ForCTC](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-Wav2Vec2ForCTC){:target="_blank"}** – Try this powerful model for real-time speech-to-text from raw audio.
+- **[WhisperForCTC](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-WhisperForCTC){:target="_blank"}** – Test speech recognition in multiple languages and noisy environments.
+- **[HubertForCTC](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-HubertForCTC){:target="_blank"}** – Experience quick and accurate voice command recognition.
+
+## Useful Resources
+
+Want to dive deeper into Automatic Speech Recognition with Spark NLP? Here are some curated resources to help you get started and explore further:
+
+**Articles and Guides**
+- *[Converting Speech to Text with Spark NLP and Python](https://www.johnsnowlabs.com/converting-speech-to-text-with-spark-nlp-and-python/){:target="_blank"}*
+- *[Simplify Your Speech Recognition Workflow with SparkNLP](https://medium.com/spark-nlp/simplify-your-speech-recognition-workflow-with-sparknlp-e381606e4e82){:target="_blank"}*
+- *[Vision Transformers and Automatic Speech Recognition in Spark NLP](https://www.nlpsummit.org/vision-transformers-and-automatic-speech-recognition-in-spark-nlp/){:target="_blank"}*
+
+**Notebooks**
+- *[Automatic Speech Recognition in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/17.Speech_Recognition.ipynb){:target="_blank"}*
+- *[Speech Recognition Transformers in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/audio){:target="_blank"}*
diff --git a/docs/en/tasks/dependency_parsing.md b/docs/en/tasks/dependency_parsing.md
new file mode 100644
index 00000000000000..5b91558e98adab
--- /dev/null
+++ b/docs/en/tasks/dependency_parsing.md
@@ -0,0 +1,178 @@
+---
+layout: docs
+header: true
+seotitle:
+title: Dependency Parsing
+permalink: docs/en/tasks/dependency_parsing
+key: docs-tasks-dependency-parsing
+modify_date: "2024-09-28"
+show_nav: true
+sidebar:
+  nav: sparknlp
+---
+
+**Dependency Parsing** is a syntactic analysis task that focuses on the grammatical structure of sentences. It identifies the dependencies between words, showcasing how they relate in terms of grammar. Spark NLP provides advanced dependency parsing models that can accurately analyze sentence structures, enabling various applications in natural language processing.
+
+Dependency parsing models process input sentences and generate a structured representation of word relationships. Common use cases include:
+
+- **Grammatical Analysis:** Understanding the grammatical structure of sentences for better comprehension.
+- **Information Extraction:** Identifying key relationships and entities in sentences for tasks like knowledge graph construction.
+
+By using Spark NLP dependency parsing models, you can build efficient systems to analyze and understand sentence structures accurately.
+
+## Picking a Model
+
+When selecting a dependency parsing model, consider factors such as the **language of the text** and the **complexity of sentence structures**. Some models may be optimized for specific languages or types of text. Evaluate whether you need **detailed syntactic parsing** or a more **general analysis** based on your application.
+ +Explore the available dependency parsing models at [Spark NLP Models](https://sparknlp.org/models) to find the one that best fits your requirements. + +#### Recommended Models for Dependency Parsing Tasks + +- **General Dependency Parsing:** Consider models such as [`dependency_conllu_en_3_0`](https://sparknlp.org/2022/06/29/dependency_conllu_en_3_0.html){:target="_blank"} for analyzing English sentences. You can also explore language-specific models tailored for non-English languages. + +Choosing the appropriate model ensures you produce accurate syntactic structures that suit your specific language and use case. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +from sparknlp.annotator import * +from sparknlp.base import * +from pyspark.ml import Pipeline + +# Document Assembler: Converts raw text into a document format suitable for processing +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +# Sentence Detector: Splits text into individual sentences +sentenceDetector = SentenceDetector() \ + .setInputCols(["document"]) \ + .setOutputCol("sentence") + +# Tokenizer: Breaks sentences into tokens (words) +tokenizer = Tokenizer() \ + .setInputCols(["sentence"]) \ + .setOutputCol("token") + +# Part-of-Speech Tagger: Tags each token with its respective POS (pretrained model) +posTagger = PerceptronModel.pretrained() \ + .setInputCols(["token", "sentence"]) \ + .setOutputCol("pos") + +# Dependency Parser: Analyzes the grammatical structure of a sentence +dependencyParser = DependencyParserModel.pretrained() \ + .setInputCols(["sentence", "pos", "token"]) \ + .setOutputCol("dependency") + +# Typed Dependency Parser: Assigns typed labels to the dependencies +typedDependencyParser = TypedDependencyParserModel.pretrained() \ + .setInputCols(["token", "pos", "dependency"]) \ + .setOutputCol("labdep") + +# Create a pipeline that includes all the stages +pipeline = Pipeline(stages=[ + documentAssembler, + sentenceDetector, + tokenizer, + posTagger, + dependencyParser, + typedDependencyParser +]) + +# Sample input data (a DataFrame with one text example) +data = {"text": ["Dependencies represent relationships between words in a sentence."]} +df = spark.createDataFrame(data) + +# Run the pipeline on the input data +result = pipeline.fit(df).transform(df) + +# Show the dependency parsing results +result.select("dependency.result").show(truncate=False) + ++---------------------------------------------------------------------------------+ +|result | ++---------------------------------------------------------------------------------+ +|[ROOT, Dependencies, represents, words, relationships, Sentence, Sentence, words]| ++---------------------------------------------------------------------------------+ +``` +```scala +import com.johnsnowlabs.nlp.DocumentAssembler +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline +import spark.implicits._ + +// Document Assembler: Converts raw text into a document format for NLP processing +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +// Sentence Detector: Splits the input text into individual sentences +val sentenceDetector = new SentenceDetector() + .setInputCols(Array("document")) + .setOutputCol("sentence") + +// Tokenizer: Breaks sentences into individual tokens (words) +val tokenizer = new Tokenizer() + .setInputCols(Array("sentence")) + .setOutputCol("token") + +// Part-of-Speech Tagger: Tags each token with its respective part of speech (pretrained model) +val posTagger = PerceptronModel.pretrained() + .setInputCols(Array("token", "sentence")) + .setOutputCol("pos") + +// Dependency Parser: Analyzes the grammatical structure of the sentence +val dependencyParser = DependencyParserModel.pretrained() + .setInputCols(Array("sentence", "pos", "token")) + .setOutputCol("dependency") + +// Typed Dependency Parser: Assigns typed labels to the dependencies +val typedDependencyParser = TypedDependencyParserModel.pretrained() + .setInputCols(Array("token", "pos", "dependency")) + .setOutputCol("labdep") + +// Create a pipeline that 
includes all stages +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + sentenceDetector, + tokenizer, + posTagger, + dependencyParser, + typedDependencyParser +)) + +// Sample input data (a DataFrame with one text example) +val df = Seq("Dependencies represent relationships between words in a Sentence").toDF("text") + +// Run the pipeline on the input data +val result = pipeline.fit(df).transform(df) + +// Show the dependency parsing results +result.select("dependency.result").show(truncate = false) + ++---------------------------------------------------------------------------------+ +|result | ++---------------------------------------------------------------------------------+ +|[ROOT, Dependencies, represents, words, relationships, Sentence, Sentence, words]| ++---------------------------------------------------------------------------------+ +``` +
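+
+For quick experiments on individual sentences, the fitted pipeline from the Python example above can also be wrapped in a `LightPipeline`, which annotates plain strings without building a Spark DataFrame. The snippet below is only a minimal sketch: it reuses the `pipeline` and `df` variables defined in the Python example, and the printed keys simply follow the output columns configured there.
+
+```python
+from sparknlp.base import LightPipeline
+
+# Wrap the fitted pipeline for fast, in-memory annotation of small inputs
+light_pipeline = LightPipeline(pipeline.fit(df))
+
+# Annotate a single sentence and inspect the unlabeled and typed dependencies
+annotations = light_pipeline.annotate(
+    "Dependencies represent relationships between words in a sentence."
+)
+
+print(annotations["dependency"])
+print(annotations["labdep"])
+```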
+ +## Try Real-Time Demos! + +If you want to see the outputs of dependency parsing models in real time, visit our interactive demos: + +- **[Grammar Analysis & Dependency Parsing](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-grammar-analysis-and-dependency-parsing){:target="_blank"}** โ€“ An interactive demo to visualize dependencies in sentences. + +## Useful Resources + +Want to dive deeper into dependency parsing with Spark NLP? Here are some curated resources to help you get started and explore further: + +**Articles and Guides** +- *[Mastering Dependency Parsing with Spark NLP and Python](https://www.johnsnowlabs.com/supercharge-your-nlp-skills-mastering-dependency-parsing-with-spark-nlp-and-python/){:target="_blank"}* + +**Notebooks** +- *[Extract Part of speech tags and perform dependency parsing on a text](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/GRAMMAR_EN.ipynb#scrollTo=syePZ-1gYyj3){:target="_blank"}* +- *[Typed Dependency Parsing with NLU.](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/nlu/colab/component_examples/dependency_parsing/NLU_typed_dependency_parsing_example.ipynb){:target="_blank"}* diff --git a/docs/en/tasks/image_captioning.md b/docs/en/tasks/image_captioning.md new file mode 100644 index 00000000000000..4317729b8b3fa7 --- /dev/null +++ b/docs/en/tasks/image_captioning.md @@ -0,0 +1,151 @@ +--- +layout: docs +header: true +seotitle: +title: Image Captioning +permalink: docs/en/tasks/image_captioning +key: docs-tasks-image-captioning +modify_date: "2024-10-05" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Image Captioning** is the process of generating descriptive text for an image based on its visual content. This task is crucial in computer vision and has a variety of applications, such as enhancing accessibility for visually impaired individuals, improving image search, and enriching multimedia content. Spark NLP integrates image captioning with other NLP and vision-based tasks, enabling efficient and scalable caption generation within the same framework. + +By utilizing image captioning models, we can produce natural language descriptions that capture the key elements and context of images. Common use cases include: + +- **Social Media**: Automatically generating captions for user-uploaded images. +- **E-Commerce**: Generating product descriptions based on visual attributes. +- **Accessibility**: Describing visual content for the visually impaired. +- **Search Engines**: Improving search results by associating images with relevant text. + +## Picking a Model + +When selecting a model for image captioning, itโ€™s important to consider the **image complexity** and the **quality of captions** required. For example, some tasks may need simple, high-level descriptions (e.g., "a person riding a bike"), while others might require more detailed, context-rich captions (e.g., "a young man riding a mountain bike on a sunny day"). + +Additionally, assess the **performance metrics** such as **BLEU score** or **ROUGE score** for evaluating the quality of generated captions. Ensure that the model is well-suited to your specific dataset, whether it consists of simple images like products or more complex images like natural scenes. + +Explore pre-trained image captioning models in the [Spark NLP Models Hub](https://sparknlp.org/models) for a variety of datasets and tasks. 
+ +#### Recommended Models for Image Captioning +- **VisionEncoderDecoder For Image Captioning:** This model can be used for generating descriptive captions based on images. It utilizes a transformer-based architecture, providing high-quality captions for various types of images. Check out the pre-trained model [`image-captioning-vit-gpt2`](https://sparknlp.org/2023/09/20/image_captioning_vit_gpt2_en.html){:target="_blank"}. + +## How to use + +
+
+{% include programmingLanguageSelectScalaPython.html %}
+```python
+# Import necessary libraries from Spark NLP
+import sparknlp
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+
+# Load image data into a DataFrame, discarding any invalid images
+imageDF = spark.read \
+    .format("image") \
+    .option("dropInvalid", value=True) \
+    .load("src/test/resources/image/")
+
+# Create an ImageAssembler to prepare image data for processing
+imageAssembler = ImageAssembler() \
+    .setInputCol("image") \
+    .setOutputCol("image_assembler")
+
+# Initialize the VisionEncoderDecoder model for image captioning (loads a pre-trained model)
+imageCaptioning = VisionEncoderDecoderForImageCaptioning \
+    .pretrained() \
+    .setBeamSize(2) \
+    .setDoSample(False) \
+    .setInputCols(["image_assembler"]) \
+    .setOutputCol("caption")
+
+# Create a pipeline that includes the image assembler and image captioning stages
+pipeline = Pipeline().setStages([imageAssembler, imageCaptioning])
+
+# Fit the pipeline on the image DataFrame and transform the data
+pipelineDF = pipeline.fit(imageDF).transform(imageDF)
+
+# Select and display the image file name and the generated captions
+pipelineDF \
+    .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "caption.result") \
+    .show(truncate=False)
+
++-----------------+---------------------------------------------------------+
+|image_name       |result                                                   |
++-----------------+---------------------------------------------------------+
+|palace.JPEG      |[a large room filled with furniture and a large window]  |
+|egyptian_cat.jpeg|[a cat laying on a couch next to another cat]            |
+|hippopotamus.JPEG|[a brown bear in a body of water]                        |
+|hen.JPEG         |[a flock of chickens standing next to each other]        |
+|ostrich.JPEG     |[a large bird standing on top of a lush green field]     |
+|junco.JPEG       |[a small bird standing on a wet ground]                  |
+|bluetick.jpg     |[a small dog standing on a wooden floor]                 |
+|chihuahua.jpg    |[a small brown dog wearing a blue sweater]               |
+|tractor.JPEG     |[a man is standing in a field with a tractor]            |
+|ox.JPEG          |[a large brown cow standing on top of a lush green field]|
++-----------------+---------------------------------------------------------+
+```
+```scala
+// Import necessary libraries from Spark NLP
+import com.johnsnowlabs.nlp.annotator._
+import com.johnsnowlabs.nlp.ImageAssembler
+import org.apache.spark.ml.Pipeline
+
+// Load image data into a DataFrame, discarding invalid images
+val imageDF: DataFrame = spark.read
+  .format("image")
+  .option("dropInvalid", value = true)
+  .load("src/test/resources/image/")
+
+// Image Assembler: Prepares image data for processing
+val imageAssembler = new ImageAssembler()
+  .setInputCol("image")
+  .setOutputCol("image_assembler")
+
+// Initialize image captioning model
+val imageCaptioning = VisionEncoderDecoderForImageCaptioning
+  .pretrained()
+  .setBeamSize(2)
+  .setDoSample(false)
+  .setInputCols("image_assembler")
+  .setOutputCol("caption")
+
+// Create and fit the pipeline
+val pipeline = new Pipeline().setStages(Array(imageAssembler, imageCaptioning))
+val pipelineDF = pipeline.fit(imageDF).transform(imageDF)
+
+// Display image names and generated captions
+pipelineDF
+  .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "caption.result")
+  .show(truncate = false)
+
++-----------------+---------------------------------------------------------+
+|image_name       |result                                                   |
++-----------------+---------------------------------------------------------+
+|palace.JPEG |[a large room filled with furniture and a large window] | +|egyptian_cat.jpeg|[a cat laying on a couch next to another cat] | +|hippopotamus.JPEG|[a brown bear in a body of water] | +|hen.JPEG |[a flock of chickens standing next to each other] | +|ostrich.JPEG |[a large bird standing on top of a lush green field] | +|junco.JPEG |[a small bird standing on a wet ground] | +|bluetick.jpg |[a small dog standing on a wooden floor] | +|chihuahua.jpg |[a small brown dog wearing a blue sweater] | +|tractor.JPEG |[a man is standing in a field with a tractor] | +|ox.JPEG |[a large brown cow standing on top of a lush green field]| ++-----------------+---------------------------------------------------------+ +``` + +## Try Real-Time Demos! + +Explore real-time image captioning outputs with our interactive demos: + +- **[VisionEncoderDecoder For Image Captioning](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-VisionEncoderDecoderForImageCaptioning){:target="_blank"}** + +## Useful Resources + +To dive deeper into image captioning using Spark NLP, check out these useful resources: + +**Notebooks** +- *[Vision Encoder-Decoder for Image Captioning](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/image/VisionEncoderDecoderForImageCaptioning.ipynb){:target="_blank"}* diff --git a/docs/en/tasks/image_classification.md b/docs/en/tasks/image_classification.md new file mode 100644 index 00000000000000..b0a5cdcc07eec5 --- /dev/null +++ b/docs/en/tasks/image_classification.md @@ -0,0 +1,154 @@ +--- +layout: docs +header: true +seotitle: +title: Image Classification +permalink: docs/en/tasks/image_classification +key: docs-tasks-image-classification +modify_date: "2024-09-26" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Image classification** is the process of assigning a **label** or **category** to an image based on its visual content. This task is fundamental in the field of computer vision and has numerous applications, from facial recognition to product classification in e-commerce. Spark NLP provides tools that make it easier to integrate image classification into your data pipelines, allowing for scalable, efficient image processing within the same framework. + +By using image classification models, we can analyze and classify images into predefined categories based on patterns and features in the image data. Some common use cases include: + +- Classifying product images into categories like **clothing**, **electronics**, **furniture**, etc. +- Recognizing objects in images, such as identifying animals, vehicles, or various types of landscapes. +- Detecting facial expressions and other human features for tasks like emotion analysis or identity verification. + +## Picking a Model + +When selecting a model for image classification, itโ€™s essential to consider several factors that ensure optimal performance for your specific use case. Start by evaluating the **type of images** you are working with, such as grayscale vs. colored, high-resolution vs. low-resolution, or simple vs. complex visual patterns. Determine whether your task requires **binary classification** (e.g., cat vs. dog) or **multiclass classification** (e.g., classifying various animal species), as the right model choice depends on the complexity of the task. + +Next, assess the **computational power** available to you. Complex models such as CNNs (Convolutional Neural Networks) can be resource-intensive but deliver highly accurate results. 
Simpler models may be sufficient for less demanding tasks. Ensure the model's **performance metrics** (accuracy, precision, recall) align with your project goals, and consider the **interpretability** of the modelโ€”more advanced models may be less interpretable but offer greater accuracy. + +Explore a wide variety of image classification models on the [Spark NLP Models](https://sparknlp.org/models), where you can find pre-trained models suited for different tasks and datasets. + +#### Recommended Models for Specific Image Classification Tasks +- **Object Detection:** For detecting objects in images, models such as [`image_classifier_vit_base_patch16_224`](https://sparknlp.org/2022/08/10/image_classifier_vit_base_patch16_224_en_3_0.html){:target="_blank"} can be used to detect objects across multiple categories. +- **Facial Expression Recognition:** Models like [`image_classifier_swin_swin_large_patch4_window12_384`](https://sparknlp.org/2023/03/23/pipeline_image_classifier_swin_swin_large_patch4_window12_384_en.html){:target="_blank"} are great for tasks that involve recognizing facial emotions. +- **Scene Classification:** To classify scenes into categories like **urban**, **rural**, or **forest**, models like [`image_classifier_vit_base_patch16_224`](https://sparknlp.org/2022/08/10/image_classifier_vit_base_patch16_224_en_3_0.html){:target="_blank"} can be applied effectively. + +By carefully considering your data, task requirements, and available resources, you can make an informed decision and leverage the best models for your image classification needs. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Load image data into a DataFrame, discarding any invalid images +imageDF = spark.read \ + .format("image") \ + .option("dropInvalid", value=True) \ + .load("src/test/resources/image/") + +# Image Assembler: Prepares image data for processing +imageAssembler = ImageAssembler() \ + .setInputCol("image") \ + .setOutputCol("image_assembler") + +# ViTForImageClassification: Pretrained Vision Transformer model for image classification +imageClassifier = ViTForImageClassification \ + .pretrained() \ + .setInputCols(["image_assembler"]) \ + .setOutputCol("class") + +# Create a pipeline with image assembler and classifier stages +pipeline = Pipeline().setStages([imageAssembler, imageClassifier]) + +# Fit the pipeline on the image DataFrame and transform the data +pipelineDF = pipeline.fit(imageDF).transform(imageDF) + +# Select and display the image file name and the classification result +pipelineDF \ + .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "class.result") \ + .show(truncate=False) + ++-----------------+----------------------------------------------------------+ +|image_name |result | ++-----------------+----------------------------------------------------------+ +|palace.JPEG |[palace] | +|egyptian_cat.jpeg|[Egyptian cat] | +|hippopotamus.JPEG|[hippopotamus, hippo, river horse, Hippopotamus amphibius]| +|hen.JPEG |[hen] | +|ostrich.JPEG |[ostrich, Struthio camelus] | +|junco.JPEG |[junco, snowbird] | +|bluetick.jpg |[bluetick] | +|chihuahua.jpg |[Chihuahua] | +|tractor.JPEG |[tractor] | +|ox.JPEG |[ox] | ++-----------------+----------------------------------------------------------+ +``` +```scala +import com.johnsnowlabs.nlp.annotator._ +import com.johnsnowlabs.nlp.ImageAssembler +import org.apache.spark.ml.Pipeline + +// Load image data into a DataFrame, discarding invalid images +val imageDF: DataFrame = spark.read + .format("image") + .option("dropInvalid", value = true) + .load("src/test/resources/image/") + +// Image Assembler: Prepares image data for further processing +val imageAssembler = new ImageAssembler() + .setInputCol("image") + .setOutputCol("image_assembler") + +// Pretrained ViT model for image classification +val imageClassifier = ViTForImageClassification + .pretrained() + .setInputCols("image_assembler") + .setOutputCol("class") + +// Create a pipeline with the image assembler and classifier stages +val pipeline = new Pipeline().setStages(Array(imageAssembler, imageClassifier)) + +// Fit the pipeline on the image DataFrame and apply transformations +val pipelineDF = pipeline.fit(imageDF).transform(imageDF) + +// Select and display the image name and the classification result +pipelineDF + .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "class.result") + .show(truncate = false) + ++-----------------+----------------------------------------------------------+ +|image_name |result | ++-----------------+----------------------------------------------------------+ +|palace.JPEG |[palace] | +|egyptian_cat.jpeg|[Egyptian cat] | +|hippopotamus.JPEG|[hippopotamus, hippo, river horse, Hippopotamus amphibius]| +|hen.JPEG |[hen] | +|ostrich.JPEG |[ostrich, Struthio camelus] | +|junco.JPEG |[junco, snowbird] | +|bluetick.jpg |[bluetick] | +|chihuahua.jpg |[Chihuahua] | +|tractor.JPEG |[tractor] | +|ox.JPEG |[ox] | 
++-----------------+----------------------------------------------------------+ +``` + +## Try Real-Time Demos! + +If you want to explore real-time image classification outputs, visit our interactive demos: + +- **[Swin For Image Classification](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-SwinForImageClassification){:target="_blank"}** +- **[VisionEncoderDecoder For Image Captioning](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-VisionEncoderDecoderForImageCaptioning){:target="_blank"}** +- **[Object Detection & Scene Classification](https://nlp.johnsnowlabs.com/detect_objects_scenes){:target="_blank"}** +- **[ConvNext For Image Classification](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-ConvNextForImageClassification){:target="_blank"}** + +## Useful Resources + +To dive deeper into image classification using Spark NLP, check out these useful resources: + +**Notebooks** +- *[Image Classification Notebooks in SparkNLP](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/image){:target="_blank"}* +- *[ViT for Image Classification with Transformers](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/19.Image_Classification.ipynb){:target="_blank"}* diff --git a/docs/en/tasks/landing_page.md b/docs/en/tasks/landing_page.md new file mode 100644 index 00000000000000..27a8668e0a826a --- /dev/null +++ b/docs/en/tasks/landing_page.md @@ -0,0 +1,171 @@ +--- +layout: docs +header: true +seotitle: Tasks +title: Spark NLP - Tasks +permalink: docs/en/tasks/landing_page +key: docs-tasks +modify_date: "2024-09-26" +show_nav: true +sidebar: + nav: sparknlp +--- + +
+ +Spark NLP is the central hub for all your State of the Art Natural Language Processing needs. Whether you're looking for demos, use cases, models, or datasets, you'll find the resources you need to begin any NLP task right here! + +## Natural Language Processing +
+ +
+ +## Text Classification + +Text classification is the process of automatically categorizing text into predefined labels or categories based on its content. + +{:.btn-block} +[Learn More](text_classification){:.button.button--info.button--rounded.button--md} + +
+ +## Token Classification + +Token classification is the process of assigning labels to individual tokens (words or subwords) in a text, commonly used for tasks like named entity recognition or part-of-speech tagging. + +{:.btn-block} +[Learn More](token_classification){:.button.button--primary.button--rounded.button--md} + +
+ +
+ +## Zero-Shot Classification + +Zero-shot classification is the process of categorizing text into labels without the model having seen any examples of those labels during training, using general knowledge and context. + +{:.btn-block} +[Learn More](zero_shot_classification){:.button.button--primary.button--rounded.button--md} + +
+ +## Text Generation + +Text generation is the process of automatically creating coherent and contextually relevant text based on a given input or prompt using machine learning models. + +{:.btn-block} +[Learn More](text_generation){:.button.button--primary.button--rounded.button--md} + +
+ +
+ +## Question Answering + +Question answering models can retrieve answers from a given text, making them useful for searching documents. Some models can even generate answers independently, without needing any context! + +{:.btn-block} +[Learn More](question_answering){:.button.button--primary.button--rounded.button--md} + +
+ +## Table Question Answering + +Table question answering models can extract answers from structured data in tables, making it easy to query and retrieve specific information. + +{:.btn-block} +[Learn More](table_question_answering){:.button.button--primary.button--rounded.button--md} + +
+ +
+ +## Summarization + +Summarization models condense long texts into shorter versions, capturing the main ideas and key points while maintaining the overall meaning of the original content. + +{:.btn-block} +[Learn More](summarization){:.button.button--primary.button--rounded.button--md} + +
+ +## Translation + +Translation models automatically convert text from one language to another while preserving the meaning and context of the original content. + +{:.btn-block} +[Learn More](translation){:.button.button--primary.button--rounded.button--md} + +
+ +
+ +## Text Preprocessing + +Text Preprocessing is the task of cleaning and transforming raw text into a format suitable for NLP tasks. This includes steps like tokenization, lowercasing, removing stop words, and stemming or lemmatization to prepare text for analysis. + +{:.btn-block} +[Learn More](text_preprocessing){:.button.button--primary.button--rounded.button--md} + +
+ +## Dependency Parsing + +Dependency Parsing is a syntactic analysis method that examines the grammatical structure of a sentence by identifying the dependencies between its words. It illustrates how words relate to each other through a dependency tree or graph, where some words act as "parents" and others as "children." + +{:.btn-block} +[Learn More](dependency_parsing){:.button.button--primary.button--rounded.button--md} + +
+ +
+ +## Computer Vision +
+ +
+ +## Image Classification + +Image classification models automatically categorize images into predefined labels or classes based on their visual content. + +{:.btn-block} +[Learn More](image_classification){:.button.button--primary.button--rounded.button--md} + +
+ +## Image Captioning + +Image captioning models generate descriptive text for images, providing context and details about the visual content they depict. + +{:.btn-block} +[Learn More](image_captioning){:.button.button--primary.button--rounded.button--md} + +
+ +
+ +## Zero-Shot Image Classification + +Zero-shot image classification is the process of categorizing images into labels without the model having seen any examples of those labels during training, using general knowledge and context. + +{:.btn-block} +[Learn More](zero_shot_image_classification){:.button.button--primary.button--rounded.button--md} + +
+ +
+ +## Audio +
+ +
+ +## Automatic Speech Recognition + +Automatic speech recognition (ASR) is the process of converting spoken language into written text. + +{:.btn-block} +[Learn More](automatic_speech_recognition){:.button.button--primary.button--rounded.button--md} + +
\ No newline at end of file diff --git a/docs/en/tasks/question_answering.md b/docs/en/tasks/question_answering.md new file mode 100644 index 00000000000000..028c092b6996f5 --- /dev/null +++ b/docs/en/tasks/question_answering.md @@ -0,0 +1,156 @@ +--- +layout: docs +header: true +seotitle: +title: Question Answering +permalink: docs/en/tasks/question_answering +key: docs-tasks-question-answering +modify_date: "2024-09-28" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Question Answering (QA)** is the task of automatically answering questions posed by humans in natural language. It is a fundamental problem in *natural language processing (NLP)*, playing a vital role in applications such as search engines, virtual assistants, customer support systems, and more. Spark NLP provides state-of-the-art (SOTA) models for QA tasks, enabling accurate and context-aware responses to user queries. + +QA systems extract relevant information from a given context or knowledge base to answer a question. Depending on the model and input, they can either find exact answers within a text or generate a more comprehensive response. + +## Types of Question Answering + +- **Open-Book QA:** In this approach, the model has access to external documents, passages, or knowledge sources to extract the answer. The system looks for relevant information within the provided text (e.g., "What is the tallest mountain in the world?" answered using a document about mountains). + +- **Closed-Book QA:** Here, the model must rely solely on the knowledge it has been trained on, without access to external sources. The answer is generated from the model's internal knowledge (e.g., answering trivia questions without referring to external material). + +Common use cases include: + +- **Fact-based QA:** Answering factoid questions such as "What is the capital of France?" +- **Reading Comprehension:** Extracting answers from a provided context, often used in assessments or educational tools. +- **Dialogue-based QA:** Supporting interactive systems that maintain context across multiple turns of conversation. + +By leveraging QA models, organizations can build robust systems that improve user engagement, provide instant information retrieval, and offer customer support in a more intuitive manner. + + + +## Picking a Model + +When selecting a model for question answering, consider the following important factors. First, assess the **nature of your data** (e.g., structured knowledge base vs. unstructured text) and the **type of QA** needed (open-book or closed-book). Open-book QA requires models that can efficiently search and extract from external sources, while closed-book QA demands models with a large internal knowledge base. + +Evaluate the **complexity of the questions**โ€”are they simple factoids or require more reasoning and multi-turn interactions? Metrics such as **Exact Match (EM)** and **F1 score** are commonly used to measure model performance in QA tasks. Finally, take into account the **computational resources** available, as some models, like BERT or T5, may require significant processing power. + +Explore models tailored for question answering at [Spark NLP Models](https://sparknlp.org/models), where youโ€™ll find various options for different QA tasks. 
+ +#### Recommended Models for Specific QA Tasks + +- **Extractive QA:** Use models like [`distilbert-base-cased-distilled-squad`](https://sparknlp.org/2023/11/26/distilbert_base_cased_qa_squad2_en.html){:target="_blank"} and [`bert-large-uncased-whole-word-masking-finetuned-squad`](https://sparknlp.org/2024/09/01/bert_large_uncased_whole_word_masking_finetuned_squad_google_bert_en.html){:target="_blank"} for extracting answers directly from a provided context. +- **Generative QA (Closed-Book):** Consider models such as [`roberta-base-squad2`](https://sparknlp.org/2022/12/02/roberta_qa_deepset_base_squad2_en.html){:target="_blank"} or [`t5_base`](https://sparknlp.org/2021/01/08/t5_base_en.html){:target="_blank"} for generating answers based on internal knowledge without external context. + +By selecting the appropriate question answering model, you can enhance your ability to deliver accurate and relevant answers tailored to your specific NLP tasks. + +## How to use + +
+
+{% include programmingLanguageSelectScalaPython.html %}
+```python
+import sparknlp
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+
+# 1. Document Assembler: Prepares the question and context text for further processing
+documentAssembler = MultiDocumentAssembler() \
+    .setInputCols(["question", "context"]) \
+    .setOutputCols(["document_question", "document_context"])
+
+# 2. Question Answering Model: Uses a pretrained RoBERTa model for QA
+spanClassifier = RoBertaForQuestionAnswering.pretrained() \
+    .setInputCols(["document_question", "document_context"]) \
+    .setOutputCol("answer") \
+    .setCaseSensitive(False)
+
+# 3. Pipeline: Combines the stages (DocumentAssembler and RoBERTa model) into a pipeline
+pipeline = Pipeline().setStages([
+    documentAssembler,
+    spanClassifier
+])
+
+# 4. Sample Data: Creating a DataFrame with a question and context
+data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
+
+# 5. Running the Pipeline: Fitting the pipeline to the data and generating answers
+result = pipeline.fit(data).transform(data)
+
+# 6. Displaying the Result: The output is the answer to the question extracted from the context
+result.select("answer.result").show(truncate=False)
+
++--------------------+
+|result              |
++--------------------+
+|[Clara]             |
++--------------------+
+```
+
+```scala
+import spark.implicits._
+import com.johnsnowlabs.nlp.base._
+import com.johnsnowlabs.nlp.annotator._
+import org.apache.spark.ml.Pipeline
+
+// 1. Document Assembler: Prepares the question and context text for further processing
+val document = new MultiDocumentAssembler()
+  .setInputCols("question", "context")
+  .setOutputCols("document_question", "document_context")
+
+// 2. Question Answering Model: Uses a pretrained RoBERTa model for QA
+val questionAnswering = RoBertaForQuestionAnswering.pretrained()
+  .setInputCols(Array("document_question", "document_context"))
+  .setOutputCol("answer")
+  .setCaseSensitive(true)
+
+// 3. Pipeline: Combines the stages (DocumentAssembler and RoBERTa model) into a pipeline
+val pipeline = new Pipeline().setStages(Array(
+  document,
+  questionAnswering
+))
+
+// 4. Sample Data: Creating a DataFrame with a question and context
+val data = Seq(("What's my name?", "My name is Clara and I live in Berkeley.")).toDF("question", "context")
+
+// 5. Running the Pipeline: Fitting the pipeline to the data and generating answers
+val result = pipeline.fit(data).transform(data)
+
+// 6. Displaying the Result: The output is the answer to the question extracted from the context
+result.select("answer.result").show(false)
+
++---------------------+
+|result               |
++---------------------+
+|[Clara]              |
++---------------------+
+```
+
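+
+The pipeline above covers extractive QA, where the answer is pulled from the provided context. For the closed-book, generative setting mentioned under the recommended models, a text-to-text checkpoint such as `t5_base` can generate an answer from its internal knowledge instead. The sketch below is only illustrative: the `question:` task prefix and the sample question are assumptions, so check the model card of the checkpoint you pick for the exact task prefix and settings it expects.
+
+```python
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("documents")
+
+# Generative (closed-book) QA: the model answers from what it learned during pretraining
+t5 = T5Transformer.pretrained("t5_base") \
+    .setTask("question:") \
+    .setInputCols(["documents"]) \
+    .setMaxOutputLength(50) \
+    .setOutputCol("answer")
+
+pipeline = Pipeline().setStages([documentAssembler, t5])
+
+data = spark.createDataFrame([["Which continent is the Sahara desert located in?"]]).toDF("text")
+result = pipeline.fit(data).transform(data)
+result.select("answer.result").show(truncate=False)
+```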
+ +## Try Real-Time Demos! + +If you want to see the outputs of question answering models in real time, visit our interactive demos: + +- **[BERT for Extractive Question Answering](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-bert-qa){:target="_blank"}** โ€“ Extract answers directly from provided context using the BERT model. +- **[RoBERTa for Question Answering](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-roberta-qa){:target="_blank"}** โ€“ Use RoBERTa for advanced extractive question answering tasks. +- **[T5 for Abstractive Question Answering](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-t5-qa){:target="_blank"}** โ€“ Generate abstractive answers using Google's T5 model. +- **[Multihop QA with BERT](https://sparknlp.org/question_answering){:target="_blank"}** โ€“ Perform complex multihop question answering by reasoning over multiple pieces of text. + +## Useful Resources + +Want to dive deeper into question answering with Spark NLP? Here are some curated resources to help you get started and explore further: + +**Articles and Guides** +- *[Empowering NLP with Spark NLP and T5 Model: Text Summarization and Question Answering](https://www.johnsnowlabs.com/empowering-nlp-with-spark-nlp-and-t5-model-text-summarization-and-question-answering/){:target="_blank"}* +- *[Question Answering in Visual NLP: A Picture is Worth a Thousand Answers](https://medium.com/spark-nlp/question-answering-in-visual-nlp-a-picture-is-worth-a-thousand-answers-535bbcb53d3c){:target="_blank"}* +- *[Spark NLP: Unlocking the Power of Question Answering](https://medium.com/john-snow-labs/spark-nlp-unlocking-the-power-of-question-answering-e5a60f925368){:target="_blank"}* + +**Notebooks** +- *[Question Answering Transformers in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/22.0_Llama2_Transformer_In_SparkNLP.ipynb){:target="_blank"}* +- *[Question Answering and Summarization with T5](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.Question_Answering_and_Summarization_with_T5.ipynb){:target="_blank"}* +- *[Question Answering in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english/question-answering){:target="_blank"}* +- *[T5 Workshop with Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.1_T5_Workshop_with_Spark_NLP.ipynb){:target="_blank"}* diff --git a/docs/en/tasks/summarization.md b/docs/en/tasks/summarization.md new file mode 100644 index 00000000000000..f1a5474fea01cc --- /dev/null +++ b/docs/en/tasks/summarization.md @@ -0,0 +1,150 @@ +--- +layout: docs +header: true +seotitle: +title: Summarization +permalink: docs/en/tasks/summarization +key: docs-tasks-summarization +modify_date: "2024-09-28" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Summarization** is the task of generating concise and informative summaries from longer documents. This is useful for a wide range of applications, such as summarizing news articles, legal documents, or any large texts where key points need to be extracted. Spark NLP offers advanced summarization models that can create high-quality summaries efficiently. + +Summarization models take input text and generate shorter versions while preserving essential information. Common use cases include: + +- **News Summaries:** Automatically condensing long news articles into brief, digestible summaries. 
+- **Legal Documents:** Summarizing lengthy contracts, case studies, or legal opinions.
+- **Research Papers:** Extracting key insights and conclusions from scientific papers.
+
+By leveraging summarization models, organizations can efficiently process large amounts of textual data and extract critical information, making it easier to consume and understand complex documents.
+
+## Picking a Model
+
+When choosing a summarization model, consider factors like the **length of the input text** and the **desired summary style** (e.g., extractive or abstractive). Some models are better suited for shorter inputs, while others excel in handling long documents. Evaluate whether your task requires **sentence-level summaries** or **paragraph-level condensation**.
+
+Consider the **domain** of the text, such as legal, scientific, or general news, as domain-specific models often perform better. Explore the available summarization models at [Spark NLP Models](https://sparknlp.org/models) to find the one that best suits your summarization needs.
+
+#### Recommended Models for Summarization Tasks
+
+- **General Summarization:** For most summarization tasks, consider models like [`bart-large-cnn`](https://sparknlp.org/2023/05/11/bart_large_cnn_en.html){:target="_blank"} and [`t5-base`](https://sparknlp.org/2021/01/08/t5_base_en.html){:target="_blank"}, which are well suited for generating concise summaries.
+
+By selecting the right model, you can efficiently condense long documents into meaningful summaries, saving time and effort.
+
+## How to use
+
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Step 1: Assemble raw text data into a format that Spark NLP can process +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("documents") + +# Step 2: Load a pretrained BART model for summarization +bart = BartTransformer.pretrained("distilbart_xsum_12_6") \ + .setTask("summarize:") \ + .setInputCols(["documents"]) \ + .setMaxOutputLength(200) \ + .setOutputCol("summaries") + +# Step 3: Create a pipeline with the document assembler and BART model +pipeline = Pipeline().setStages([documentAssembler, bart]) + +# Step 4: Sample data - a long text passage for summarization +data = spark.createDataFrame([[ + "Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a " + + "downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness" + + " of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this " + + "paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework " + + "that converts all text-based language problems into a text-to-text format. Our systematic study compares " + + "pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens " + + "of language understanding tasks. By combining the insights from our exploration with scale and our new " + + "Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering " + + "summarization, question answering, text classification, and more. To facilitate future work on transfer " + + "learning for NLP, we release our data set, pre-trained models, and code." +]]).toDF("text") + +# Step 5: Apply the pipeline to generate the summary +result = pipeline.fit(data).transform(data) + +# Step 6: Display the summary +result.select("summaries.result").show(truncate=False) + +# +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +# |result | +# +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +# |[transfer learning has emerged as a powerful technique in natural language processing (NLP) the effectiveness of transfer learning has given rise to a diversity of approaches, | +# |methodologies, and practice .] 
                                                                                                                                                 |
+# +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+```
+```scala
+import spark.implicits._
+import com.johnsnowlabs.nlp.base.DocumentAssembler
+import com.johnsnowlabs.nlp.annotators.seq2seq.BartTransformer
+import org.apache.spark.ml.Pipeline
+
+// Step 1: Document Assembler to prepare the text data
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("documents")
+
+// Step 2: Load BART model for text generation with customization
+val bart = BartTransformer.pretrained("distilbart_xsum_12_6")
+  .setInputCols(Array("documents"))
+  .setMinOutputLength(10)
+  .setMaxOutputLength(30)
+  .setDoSample(true)
+  .setTopK(50)
+  .setOutputCol("generation")
+
+// Step 3: Define the pipeline stages
+val pipeline = new Pipeline().setStages(Array(documentAssembler, bart))
+
+// Step 4: Input text data to be summarized
+val data = Seq(
+  "PG&E stated it scheduled the blackouts in response to forecasts for high winds " +
+    "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " +
+    "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
+).toDF("text")
+
+// Step 5: Fit the model and apply the pipeline
+val result = pipeline.fit(data).transform(data)
+
+// Step 6: Show the generated summary
+result.select("generation.result").show(truncate = false)
+
+// +--------------------------------------------------------------+
+// |result                                                        |
+// +--------------------------------------------------------------+
+// |[Nearly 800 thousand customers were affected by the shutoffs.]|
+// +--------------------------------------------------------------+
+
+```
+
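+
+The same pipeline shape also works with the `t5-base` checkpoint listed under the recommended models; only the model stage and its task prefix change. The snippet below is a brief, hedged sketch rather than a full example: it reuses the `documentAssembler`, `Pipeline` import, and `data` DataFrame from the Python example above, and it assumes the standard `summarize:` task prefix used with T5.
+
+```python
+# Swap the BART stage for a T5 stage; the document assembler stays the same
+t5 = T5Transformer.pretrained("t5_base") \
+    .setTask("summarize:") \
+    .setInputCols(["documents"]) \
+    .setMaxOutputLength(200) \
+    .setOutputCol("summaries")
+
+t5_pipeline = Pipeline().setStages([documentAssembler, t5])
+t5_result = t5_pipeline.fit(data).transform(data)
+t5_result.select("summaries.result").show(truncate=False)
+```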
+
+## Try Real-Time Demos!
+
+If you want to see the outputs of summarization models in real time, visit our interactive demos:
+
+- **[Sparknlp Text Summarization](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-bert-annotators){:target="_blank"}** – A live demo where you can try your inputs on summarization models on the go.
+- **[Text summarization](https://demo.johnsnowlabs.com/public/TEXT_SUMMARIZATION/){:target="_blank"}** – An interactive demo for summarizing long passages of text.
+
+## Useful Resources
+
+Here are some resources to get you started with summarization in Spark NLP:
+
+**Articles and Guides**
+- *[Empowering NLP with Spark NLP and T5 Model: Text Summarization and Question Answering](https://www.johnsnowlabs.com/empowering-nlp-with-spark-nlp-and-t5-model-text-summarization-and-question-answering/){:target="_blank"}*
+
+**Notebooks**
+- **Document Summarization with BART** *[1](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/08.Summarization_with_BART.ipynb){:target="_blank"}*, *[2](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/08.Summarization_with_BART.ipynb){:target="_blank"}*
+- *[T5 Workshop with Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.1_T5_Workshop_with_Spark_NLP.ipynb){:target="_blank"}*
\ No newline at end of file
diff --git a/docs/en/tasks/table_question_answering.md b/docs/en/tasks/table_question_answering.md
new file mode 100644
index 00000000000000..4729721b5e5a84
--- /dev/null
+++ b/docs/en/tasks/table_question_answering.md
@@ -0,0 +1,212 @@
+---
+layout: docs
+header: true
+seotitle:
+title: Table Question Answering
+permalink: docs/en/tasks/table_question_answering
+key: docs-tasks-table-question-answering
+modify_date: "2024-09-28"
+show_nav: true
+sidebar:
+  nav: sparknlp
+---
+
+**Table question answering** is the task of answering questions from structured tabular data. This is particularly useful for applications like financial reports, databases, and other contexts where information is stored in tables. Spark NLP provides state-of-the-art solutions for table question answering, enabling accurate extraction and generation of answers from tables in various formats.
+
+Table question answering models process tabular data and the question to output the most relevant answer. Common use cases include:
+
+- **Financial Reports:** Automatically extracting insights from financial data tables.
+- **Databases:** Querying relational databases or spreadsheet data to extract specific information.
+- **Business Intelligence:** Enabling non-technical users to interact with and extract data from complex tables using natural language.
+
+By leveraging table question answering, organizations can build systems capable of understanding tabular structures, making it easier to answer complex queries and automate data extraction.
+
+## Picking a Model
+
+When selecting a model for table question answering, consider factors such as the **complexity of the table** and the **nature of the query**. Some models work better with numerical data, while others may handle textual data or multi-row operations more effectively.
+
+Evaluate the **format of the tables** you are working with (e.g., CSV, Excel, or SQL tables), and ensure that the model can process the tabular structure accurately.
Also, consider the **domain** of your tables, such as finance, healthcare, or retail, as some models may be pre-trained on specific domains. + +Explore models tailored for table question answering at [Spark NLP Models](https://sparknlp.org/models), where youโ€™ll find various options for different table QA tasks. + +#### Recommended Models for Specific Table Question Answering Tasks + +- **General Table QA:** Consider models such as [`tapas-large-finetuned-wtq`](https://sparknlp.org/2022/09/30/table_qa_tapas_large_finetuned_wtq_en.html){:target="_blank"} for answering questions across different types of tables. +- **SQL Query Generation:** Use models like [`t5-small-wikiSQL`](https://sparknlp.org/2022/05/31/t5_small_wikiSQL_en_3_0.html){:target="_blank"} to automatically generate SQL queries from natural language inputs. + +By selecting the right model for table question answering, you can extract valuable insights from structured data and answer complex queries efficiently. + +## How to use + +
+
+{% include programmingLanguageSelectScalaPython.html %}
+```python
+import sparknlp
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+
+# Document Assembler: Assembles table JSON and questions into documents
+document_assembler = MultiDocumentAssembler()\
+    .setInputCols("table_json", "questions")\
+    .setOutputCols("document_table", "document_questions")
+
+# Sentence Detector: Splits the questions into individual sentences
+sentence_detector = SentenceDetector()\
+    .setInputCols(["document_questions"])\
+    .setOutputCol("questions")
+
+# Table Assembler: Converts the table document to the proper format
+table_assembler = TableAssembler()\
+    .setInputCols(["document_table"])\
+    .setOutputCol("table")
+
+# Tapas Model: Loads pretrained Tapas for table question answering
+tapas = TapasForQuestionAnswering\
+    .pretrained()\
+    .setInputCols(["questions", "table"])\
+    .setOutputCol("answers")
+
+# Pipeline: Combines all stages
+pipeline = Pipeline(stages=[
+    document_assembler,
+    sentence_detector,
+    table_assembler,
+    tapas
+])
+
+# Sample JSON data for the table
+json_data = """
+{
+  "header": ["name", "money", "age"],
+  "rows": [
+    ["Donald Trump", "$100,000,000", "75"],
+    ["Elon Musk", "$20,000,000,000,000", "55"]
+  ]
+}
+"""
+
+# Sample questions about the table data
+questions = """
+Who earns 100,000,000?
+Who has more money?
+How much they all earn?
+How old are they?
+"""
+
+# Create a DataFrame with the table JSON and the questions
+data = spark.createDataFrame([[json_data, questions]]).toDF("table_json", "questions")
+
+# Fit and transform the data with the pipeline
+model = pipeline.fit(data)
+model\
+    .transform(data)\
+    .selectExpr("explode(answers) AS answer")\
+    .select("answer.metadata.question", "answer.result")\
+    .show(truncate=False)
+
+# Expected Output:
+# +-----------------------+----------------------------------------+
+# |question               |result                                  |
+# +-----------------------+----------------------------------------+
+# |Who earns 100,000,000? |Donald Trump                            |
+# |Who has more money?    |Elon Musk                               |
+# |How much they all earn?|COUNT($100,000,000, $20,000,000,000,000)|
+# |How old are they?      |AVERAGE(75, 55)                         |
+# +-----------------------+----------------------------------------+
+```
+```scala
+import spark.implicits._
+import com.johnsnowlabs.nlp.base._
+import com.johnsnowlabs.nlp.annotator._
+import org.apache.spark.ml.Pipeline
+
+// Questions: Sample questions about the table data
+val questions =
+  """
+    |Who earns 100,000,000?
+    |Who has more money?
+    |How much they all earn?
+    |How old are they?
+ |""".stripMargin.trim + +// Table Data: JSON format for table with name, money, and age columns +val jsonData = + """ + |{ + | "header": ["name", "money", "age"], + | "rows": [ + | ["Donald Trump", "$100,000,000", "75"], + | ["Elon Musk", "$20,000,000,000,000", "55"] + | ] + |} + |""".stripMargin.trim + +// DataFrame: Create DataFrame with table data and questions +val data = Seq((jsonData, questions)) + .toDF("json_table", "questions") + .repartition(1) + +// Document Assembler: Assemble the table JSON and questions into documents +val docAssembler = new MultiDocumentAssembler() + .setInputCols("json_table", "questions") + .setOutputCols("document_table", "document_questions") + +// Sentence Detector: Detects individual questions from the text +val sentenceDetector = SentenceDetectorDLModel + .pretrained() + .setInputCols(Array("document_questions")) + .setOutputCol("question") + +// Table Assembler: Converts JSON table data into table format +val tableAssembler = new TableAssembler() + .setInputFormat("json") + .setInputCols(Array("document_table")) + .setOutputCol("table") + +// Tapas Model: Pretrained model for table question answering +val tapas = TapasForQuestionAnswering + .pretrained() + .setInputCols(Array("question", "table")) + .setOutputCol("answer") + +// Pipeline: Combine all components into a pipeline +val pipeline = new Pipeline() + .setStages( + Array( + docAssembler, + sentenceDetector, + tableAssembler, + tapas)) + +// Model: Fit the pipeline to the data +val pipelineModel = pipeline.fit(data) +val result = pipeline.fit(data).transform(data) + +// Show Results: Explode answers and show the results for each question +result + .selectExpr("explode(answer) as answer") + .selectExpr( + "answer.metadata.question", + "answer.result") + +// Expected Output: +// +-----------------------+----------------------------------------+ +// |question |result | +// +-----------------------+----------------------------------------+ +// |Who earns 100,000,000? |Donald Trump | +// |Who has more money? |Elon Musk | +// |How much they all earn?|COUNT($100,000,000, $20,000,000,000,000)| +// |How old are they? |AVERAGE(75, 55) | +// +-----------------------+----------------------------------------+ +``` +
+ +## Try Real-Time Demos! + +If you want to see the outputs of table question answering models in real time, visit our interactive demos: + +- **[Tapas for Table Question Answering](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-tapas){:target="_blank"}** โ€“ TAPAS answers questions from tabular data. +- **[Tapex for Table QA](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-tapex){:target="_blank"}** โ€“ TAPEX handles complex table queries and computations. +- **[SQL Query Generation](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-text-to-sql-t5){:target="_blank"}** โ€“ Converts natural language questions into SQL queries from tables. + +## Useful Resources + +Want to dive deeper into table question answering with Spark NLP? Here are some curated resources to help you get started and explore further: + +**Articles and Guides** +- *[Empowering NLP with Spark NLP and TAPAS Model: Table Question Answering](https://www.johnsnowlabs.com/empowering-nlp-with-spark-nlp-and-tapas-model-table-question-answering/){:target="_blank"}* +- *[Table-based Question Answering with Spark NLP](https://www.johnsnowlabs.com/table-based-question-answering-with-spark-nlp/){:target="_blank"}* + +**Notebooks** +- *[TAPAS Model for Table Question Answering](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/11.Table_QA_with_TAPAS.ipynb){:target="_blank"}* +- *[SQL Code Generation from Tables](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.2_SQL_Code_Generation_and_Style_Transfer_with_T5.ipynb){:target="_blank"}* +- *[TableQA with Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/12.Table_QA_with_Tapex.ipynb){:target="_blank"}* diff --git a/docs/en/tasks/text_classification.md b/docs/en/tasks/text_classification.md new file mode 100644 index 00000000000000..7f643c60f5c67c --- /dev/null +++ b/docs/en/tasks/text_classification.md @@ -0,0 +1,164 @@ +--- +layout: docs +header: true +seotitle: +title: Text Classification +permalink: docs/en/tasks/text_classification +key: docs-tasks-text-classification +modify_date: "2024-09-26" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Text classification** is the process of assigning a **category** or **label** to a piece of text, such as an email, tweet, or review. It plays a crucial role in *natural language processing (NLP)*, where it is used to automatically organize text into predefined categories. Spark NLP provides various solutions to address text classification challenges effectively. + +In this context, text classification involves analyzing a document's content to categorize it into one or more predefined groups. Common use cases include: + +- Organizing news articles into categories like **politics**, **sports**, **entertainment**, or **technology**. +- Conducting sentiment analysis, where customer reviews of products or services are classified as **positive**, **negative**, or **neutral**. + +By leveraging text classification, organizations can enhance their ability to process and understand large volumes of text data efficiently. + +
+ +
+ +## Picking a Model + +When selecting a model for text classification, itโ€™s crucial to evaluate several factors to ensure optimal performance for your specific use case. Start by analyzing the **nature of your data**, considering whether it is formal or informal and its length (e.g., tweets vs. reviews). Determine if your task requires **binary classification** (like spam detection) or **multiclass classification** (such as categorizing news topics), as some models excel in specific scenarios. + +Next, assess the **model complexity**; simpler models like Logistic Regression work well for straightforward tasks, while more complex models like BERT are suited for nuanced understanding. Consider the **availability of labeled data**โ€”larger datasets allow for training sophisticated models, whereas smaller datasets may benefit from pre-trained options. Define key **performance metrics** (e.g., accuracy, F1 score) to inform your choice, and ensure the model's interpretability meets your requirements. Finally, account for **resource constraints**, as advanced models will demand more memory and processing power. + +To explore and select from a variety of models, visit [Spark NLP Models](https://sparknlp.org/models), where you can find models tailored for different tasks and datasets. + + +#### Recommended Models for Specific Text Classification Tasks +- **Sentiment Analysis:** Use models specifically designed for sentiment detection, such as [`distilbert_sequence_classifier_sst2`](https://sparknlp.org/2021/11/21/distilbert_sequence_classifier_sst2_en.html){:target="_blank"}. +- **News Categorization:** Models like [`distilroberta-finetuned-financial-news-sentiment-analysis`](https://sparknlp.org/2023/11/29/roberta_sequence_classifier_distilroberta_finetuned_financial_news_sentiment_analysis_en.html){:target="_blank"} are ideal for classifying news articles into relevant categories. +- **Review Analysis:** For product reviews, consider using [`distilbert_base_uncased_finetuned_sentiment_amazon`](https://sparknlp.org/2023/11/18/distilbert_base_uncased_finetuned_sentiment_amazon_en.html){:target="_blank"} for more nuanced insights. + +If you have specific needs that are not covered by existing models, you can train your own model tailored to your unique requirements. Follow the guidelines provided in the [Spark NLP Training Documentation](https://sparknlp.org/docs/en/training) to get started on creating and training a model suited for your text classification task. + +By thoughtfully considering these factors and using the right models, you can enhance your NLP applications significantly. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Assembling the document from the input text +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +# Tokenizing the text +tokenizer = Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("token") + +# Loading a pre-trained sequence classification model +# You can replace `BertForSequenceClassification.pretrained()` with your selected model +# For example: BertForSequenceClassification.pretrained("distilbert_sequence_classifier_sst2", "en") +sequenceClassifier = BertForSequenceClassification.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("label") \ + .setCaseSensitive(True) + +# Defining the pipeline with document assembler, tokenizer, and classifier +pipeline = Pipeline().setStages([ + documentAssembler, + tokenizer, + sequenceClassifier +]) + +# Creating a sample DataFrame +data = spark.createDataFrame([["I loved this movie when I was a child.", "It was pretty boring."]]).toDF("text") + +# Fitting the pipeline and transforming the data +result = pipeline.fit(data).transform(data) + +# Showing the classification result +result.select("label.result").show(truncate=False) + ++------+ +|result| ++------+ +|[pos] | +|[neg] | ++------+ +``` + +```scala +import spark.implicits._ +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline + +// Step 1: Convert raw text into document format +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +// Step 2: Tokenize the document into words +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +// Step 3: Load a pre-trained BERT model for sequence classification +val sequenceClassifier = BertForSequenceClassification.pretrained() + .setInputCols("token", "document") + .setOutputCol("label") + .setCaseSensitive(true) + +// Step 4: Define the pipeline with stages for document assembly, tokenization, and classification +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + tokenizer, + sequenceClassifier +)) + +// Step 5: Create sample data and apply the pipeline +val data = Seq("I loved this movie when I was a child.", "It was pretty boring.").toDF("text") +val result = pipeline.fit(data).transform(data) + +// Step 6: Show the classification results +result.select("label.result").show(false) + ++------+ +|result| ++------+ +|[pos] | +|[neg] | ++------+ +``` +
+ +## Try Real-Time Demos! + +If you want to see the outputs of text classification models in real time, visit our interactive demos: + +- **[BERT Annotators Demo](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-bert-annotators){:target="_blank"}** โ€“ A live demo where you can try your inputs on text classification models on the go. +- **[Sentiment & Emotion Detection Demo](https://nlp.johnsnowlabs.com/detect_sentiment_emotion){:target="_blank"}** โ€“ An interactive demo for sentiment and emotion detection. + +## Useful Resources + +Want to dive deeper into text classification with Spark NLP? Here are some curated resources to help you get started and explore further: + +**Articles and Guides** +- *[Mastering Text Classification with Spark NLP](https://www.johnsnowlabs.com/mastering-text-classification-with-spark-nlp/){:target="_blank"}* +- *[Unlocking the Power of Sentiment Analysis with Deep Learning](https://www.johnsnowlabs.com/unlocking-the-power-of-sentiment-analysis-with-deep-learning/){:target="_blank"}* +- *[Sentiment Analysis with Spark NLP without Machine Learning](https://www.johnsnowlabs.com/sentiment-analysis-with-spark-nlp-without-machine-learning/){:target="_blank"}* +- *[Financial Sentiment Analysis Using SparkNLP Achieving 95% Accuracy](https://medium.com/spark-nlp/financial-sentiment-analysis-using-sparknlp-achieving-95-accuracy-e2df27744617){:target="_blank"}* + +**Notebooks** +- *[Text Classification with ClassifierDL](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.Text_Classification_with_ClassifierDL.ipynb){:target="_blank"}* + +**Training Scripts** +- *[Training Multi-class Text and Sentiment Classification models](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/training/english/classification){:target="_blank"}* +- *[Training a text classification model with INSTRUCTOR Embeddings](https://medium.com/spark-nlp/training-a-text-classification-model-with-instructor-embeddings-1a29e8c8792b){:target="_blank"}* \ No newline at end of file diff --git a/docs/en/tasks/text_generation.md b/docs/en/tasks/text_generation.md new file mode 100644 index 00000000000000..beb230117f4f03 --- /dev/null +++ b/docs/en/tasks/text_generation.md @@ -0,0 +1,180 @@ +--- +layout: docs +header: true +seotitle: +title: Text Generation +permalink: docs/en/tasks/text_generation +key: docs-tasks-text-generation +modify_date: "2024-09-28" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Text generation** is the task of generating meaningful text based on a given input. It is widely used in various *natural language processing (NLP)* applications such as summarization, machine translation, conversational agents, and more. Spark NLP provides SOTA solutions for text generation, enabling you to produce high-quality and contextually relevant text outputs. + +Text generation models create text sequences by predicting the next word or sequence of words based on the input prompt. Common use cases include: + +- **Summarization:** Automatically generating concise summaries from longer text. +- **Machine Translation:** Translating text from one language to another while maintaining meaning and fluency. +- **Conversational Agents:** Building intelligent systems that can hold natural and coherent conversations with users. + +By leveraging text generation, organizations can build systems capable of generating human-like text, making it useful for content creation, automated writing, and more. 
+ + + +## Picking a Model + +When selecting a model for text generation, consider several important factors. First, determine the **type of output** you require (e.g., summarization, translation, or free-form generation). Decide whether your task needs **structured output** like summaries or **creative text generation**. + +Next, evaluate the **style and language** of the data you'll be working withโ€”are you dealing with formal language (e.g., research papers) or informal language (e.g., social media)? Model performance metrics such as **perplexity**, **BLEU score**, or **ROUGE score** are also crucial for understanding the quality of the generated text. Finally, take into account the **computational resources** available, as some models (e.g., GPT or T5) may require significant memory and processing power. + +Explore models tailored for text generation at [Spark NLP Models](https://sparknlp.org/models), where youโ€™ll find various options for different text generation tasks. + +#### Recommended Models for Specific Text Generation Tasks + +- **Summarization:** Use models like [`t5-base`](https://sparknlp.org/2021/01/08/t5_base_en.html){:target="_blank"} and [`bart-large-cnn`](https://sparknlp.org/2023/05/11/bart_large_cnn_en.html){:target="_blank"} for general-purpose text summarization tasks. +- **Machine Translation:** Consider models such as [`t5_base`](https://sparknlp.org/2021/01/08/t5_base_en.html){:target="_blank"} and [`m2m100_418M`](https://sparknlp.org/2024/05/19/m2m100_418M_xx.html){:target="_blank"} you can also consider searching models with the [`Marian Transformer`](https://sparknlp.org/models?annotator=MarianTransformer){:target="_blank"} Annotator class for translating between non-english languages. +- **Conversational Agents:** For building chatbots and dialogue systems, use models like [`gpt2`](https://sparknlp.org/2021/12/03/gpt2_en.html){:target="_blank"} to generate coherent and contextually aware responses. + +By selecting the appropriate text generation model, you can enhance your ability to produce contextually rich and meaningful text outputs tailored to your specific NLP tasks. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Assembling the document from the input text +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("documents") + +# Loading a pre-trained text generation model +# You can replace `T5Transformer.pretrained("t5_small", "xx")` with your selected model and the transformer it's based on +# For example: BartTransformer.pretrained("bart_large_cnn") +t5 = T5Transformer.pretrained("t5_small", "xx") \ + .setTask("summarize:") \ + .setInputCols(["documents"]) \ + .setMaxOutputLength(200) \ + .setOutputCol("summaries") + +# Defining the pipeline with document assembler, tokenizer, and classifier +pipeline = Pipeline().setStages([documentAssembler, t5]) + +# Creating a sample DataFrame +data = spark.createDataFrame([[ + "Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a " + + "downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness" + + " of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this " + + "paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework " + + "that converts all text-based language problems into a text-to-text format. Our systematic study compares " + + "pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens " + + "of language understanding tasks. By combining the insights from our exploration with scale and our new " + + "Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering " + + "summarization, question answering, text classification, and more. To facilitate future work on transfer " + + "learning for NLP, we release our data set, pre-trained models, and code." +]]).toDF("text") + +# Fitting the pipeline and transforming the data +result = pipeline.fit(data).transform(data) + +# Showing the results +result.select("summaries.result").show(truncate=False) + ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|result | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|[transfer learning has emerged as a powerful technique in natural language processing (NLP) the effectiveness of transfer learning has given rise to a diversity of approaches, | +| methodologies, and practice .] 
| +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +```scala +import spark.implicits._ +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.annotators.seq2seq.T5Transformer +import org.apache.spark.ml.Pipeline + +// Step 1: Assembling the document from the input text +// Converts the input 'text' column into a 'document' column, required for NLP tasks +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("documents") + +// Step 3: Loading a pre-trained BERT model for token classification +// Applies a pre-trained BERT model for Named Entity Recognition (NER) to classify tokens +// `T5Transformer.pretrained()` loads the model, and `setInputCols` defines the input columns +val t5 = T5Transformer.pretrained("t5_small") + .setTask("summarize:") + .setInputCols(Array("documents")) + .setMaxOutputLength(200) + .setOutputCol("summaries") + +// Step 4: Defining the pipeline +// The pipeline stages are document assembler, tokenizer, and token classifier +val pipeline = new Pipeline().setStages(Array(documentAssembler, t5)) + +// Step 5: Creating a sample DataFrame +// Creates a DataFrame with a sample sentence that will be processed by the pipeline +val data = Seq( + "Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a " + + "downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness" + + " of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this " + + "paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework " + + "that converts all text-based language problems into a text-to-text format. Our systematic study compares " + + "pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens " + + "of language understanding tasks. By combining the insights from our exploration with scale and our new " + + "Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering " + + "summarization, question answering, text classification, and more. To facilitate future work on transfer " + + "learning for NLP, we release our data set, pre-trained models, and code." +).toDF("text") + +// Step 6: Fitting the pipeline and transforming the data +// The pipeline is fitted on the input data, then it performs the transformation to generate token labels +val result = pipeline.fit(data).transform(data) + +// Step 7: Showing the results +// Displays the 'label.result' column, which contains the Named Entity Recognition (NER) labels for each token +result.select("summaries.result").show(false) + ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|result | ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|[transfer learning has emerged as a powerful technique in natural language processing (NLP) the effectiveness of transfer learning has given rise to a diversity of approaches, | +|methodologies, and practice .] 
| ++--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` +
+ +## Try Real-Time Demos! + +If you want to see the outputs of text generation models in real time, visit our interactive demos: + +- **[Generative Pre-trained Transformer 2 (OpenAI GPT2)](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-gpt2){:target="_blank"}** โ€“ GPT-2 generates human-like text from prompts. +- **[Text-To-Text Transfer Transformer (Google T5)](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-t5){:target="_blank"}** โ€“ T5 performs text tasks like summarization and translation. +- **[SQL Query Generation](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-text-to-sql-t5){:target="_blank"}** โ€“ Converts natural language commands into SQL queries. +- **[Multilingual Text Translation with MarianMT](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-MarianMT){:target="_blank"}** โ€“ Translates text between multiple languages. + +## Useful Resources + +Want to dive deeper into text generation with Spark NLP? Here are some curated resources to help you get started and explore further: + +**Articles and Guides** +- *[Empowering NLP with Spark NLP and T5 Model: Text Summarization and Question Answering](https://www.johnsnowlabs.com/empowering-nlp-with-spark-nlp-and-t5-model-text-summarization-and-question-answering/){:target="_blank"}* +- *[Multilingual machine translation with Spark NLP](https://www.johnsnowlabs.com/multilingual-machine-translation-with-spark-nlp/){:target="_blank"}* + +**Notebooks** +- *[GPT2Transformer: OpenAI Text-To-Text Transformer](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/14.GPT2_Transformer_In_Spark_NLP.ipynb){:target="_blank"}* +- *[LLAMA2Transformer: CausalLM wiht Open Source models](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/22.0_Llama2_Transformer_In_SparkNLP.ipynb){:target="_blank"}* +- *[SQL Code Generation and Style Transfer with T5](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.2_SQL_Code_Generation_and_Style_Transfer_with_T5.ipynb){:target="_blank"}* +- *[T5 Workshop with Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.1_T5_Workshop_with_Spark_NLP.ipynb){:target="_blank"}* +- *[Translation in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/20.0_Translations.ipynb){:target="_blank"}* +- *[Summarization in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.2_SQL_Code_Generation_and_Style_Transfer_with_T5.ipynb){:target="_blank"}* +- *[OpenAI in SparkNLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/22.1_OpenAI_In_SparkNLP.ipynb){:target="_blank"}* diff --git a/docs/en/tasks/text_preprocessing.md b/docs/en/tasks/text_preprocessing.md new file mode 100644 index 00000000000000..5135ffab9ac7d6 --- /dev/null +++ b/docs/en/tasks/text_preprocessing.md @@ -0,0 +1,170 @@ +--- +layout: docs +header: true +seotitle: +title: Text Preprocessing +permalink: docs/en/tasks/text_preprocessing +key: docs-tasks-text-preprocessing +modify_date: "2024-10-05" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Text Preprocessing** is the foundational task of cleaning and transforming raw text data into a structured format that can be used in NLP tasks. 
It involves a series of steps to normalize text, remove noise, and prepare it for deeper analysis. Spark NLP provides a range of tools for efficient and scalable text preprocessing. + +## Key Preprocessing Steps + +When preprocessing text, consider the following key steps along with the recommended Spark NLP annotators: + +1. [`Tokenization:`](https://sparknlp.org/docs/en/annotators#tokenizer){:target="_blank"} Break text into smaller units (words, subwords, or sentences). +2. [`Spell Checking:`](https://sparknlp.org/docs/en/annotators#norvigsweeting-spellchecker){:target="_blank"} Correct misspelled words to improve accuracy in NLP tasks. +3. [`Normalization:`](https://sparknlp.org/docs/en/annotators#normalizer){:target="_blank"} Standardize text by converting to lowercase, expanding contractions, or removing accents. +4. [`Stopword Removal:`](https://sparknlp.org/docs/en/annotators#stopwordscleaner){:target="_blank"} Remove common, non-informative words (e.g., "the," "is," "and"). +5. [`Lemmatization:`](https://sparknlp.org/docs/en/annotators#lemmatizer){:target="_blank"} Reduce words to their base form (e.g., "running" โ†’ "run"). + +These steps and annotators will help ensure your text data is clean, consistent, and ready for analysis. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Document Assembler: Converts input text into a suitable format for NLP processing +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +# Tokenizer: Splits text into individual tokens (words) +tokenizer = Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("tokens") + +# SpellChecker: Corrects misspelled words +spellChecker = NorvigSweetingModel.pretrained() \ + .setInputCols(["tokens"]) \ + .setOutputCol("corrected") + +# Normalizer: Cleans and standardizes text data +normalizer = Normalizer() \ + .setInputCols(["corrected"]) \ + .setOutputCol("normalized") + +# StopWordsCleaner: Removes stopwords +stopwordsCleaner = StopWordsCleaner() \ + .setInputCols(["normalized"]) \ + .setOutputCol("cleanTokens") + +# Lemmatizer: Reduces words to their base form +lemmatizer = LemmatizerModel.pretrained() \ + .setInputCols(["cleanTokens"]) \ + .setOutputCol("lemmas") + +# Pipeline: Assembles the document assembler and preprocessing stages +pipeline = Pipeline().setStages([ + documentAssembler, tokenizer, spellChecker, normalizer, stopwordsCleaner, lemmatizer +]) + +# Input Data: A small example dataset is created and converted to a DataFrame +data = spark.createDataFrame([["Text preprocessing is essential in NLP!"]]).toDF("text") + +# Running the Pipeline: Fits the pipeline to the data and preprocesses the text +result = pipeline.fit(data).transform(data) + +# Output: Displays the processed tokens and lemmas +result.select("lemmas.result").show(truncate=False) + ++----------------------------------------------------+ +|lemmas.result | ++----------------------------------------------------+ +|[text, preprocess, essential, in, NLP] | ++----------------------------------------------------+ +``` +```scala +import com.johnsnowlabs.nlp.DocumentAssembler +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline +import spark.implicits._ + +// Document Assembler: Converts input text into a suitable format for NLP processing +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +// Tokenizer: Splits text into individual tokens (words) +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("tokens") + +// SpellChecker: Corrects misspelled words +val spellChecker = NorvigSweetingModel.pretrained() + .setInputCols(Array("tokens")) + .setOutputCol("corrected") + +// Normalizer: Cleans and standardizes text data +val normalizer = new Normalizer() + .setInputCols(Array("corrected")) + .setOutputCol("normalized") + +// StopWordsCleaner: Removes stopwords +val stopwordsCleaner = new StopWordsCleaner() + .setInputCols(Array("normalized")) + .setOutputCol("cleanTokens") + +// Lemmatizer: Reduces words to their base form +val lemmatizer = LemmatizerModel.pretrained() + .setInputCols(Array("cleanTokens")) + .setOutputCol("lemmas") + +// Pipeline: Assembles the document assembler and preprocessing stages +val pipeline = new Pipeline().setStages(Array( + documentAssembler, tokenizer, spellChecker, normalizer, stopwordsCleaner, lemmatizer +)) + +// Input Data: A small example dataset is created and converted to a DataFrame +val data = Seq("Text preprocessing is essential in NLP!").toDF("text") + +// Running the Pipeline: Fits the pipeline to the data and preprocesses the text +val result = 
pipeline.fit(data).transform(data) + +// Display the results +result.select("lemmas.result").show(false) + ++----------------------------------------------------+ +|result | ++----------------------------------------------------+ +|[text, preprocess, essential, in, NLP] | ++----------------------------------------------------+ +``` +
+ +## Try Real-Time Demos! + +If you want to see text preprocessing in real-time, check out our interactive demos: + +- **[Text Preprocessing with Spark NLP](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-text-preprocessing){:target="_blank"}** โ€“ Explore how Spark NLP preprocesses raw text data. +- **[Stopwords Removing with Spark NLP](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-stop-words-removal){:target="_blank"}** โ€“ Explore how Spark NLP removes stop words from text. + +## Useful Resources + +Want to learn more about text preprocessing with Spark NLP? Explore the following resources: + +**Articles and Guides** +- *[Text cleaning: removing stopwords from text with Spark NLP](https://www.johnsnowlabs.com/text-cleaning-removing-stopwords-from-text-with-spark-nlp/){:target="_blank"}* +- *[Unleashing the Power of Text Tokenization with Spark NLP](https://www.johnsnowlabs.com/unleashing-the-power-of-text-tokenization-with-spark-nlp/){:target="_blank"}* +- *[Tokenizing Asian texts into words with word segmentation models in Spark NLP](https://medium.com/john-snow-labs/tokenizing-asian-texts-into-words-with-word-segmentation-models-42e04d8e03da){:target="_blank"}* +- *[Text Cleaning: Standard Text Normalization with Spark NLP](https://www.johnsnowlabs.com/text-cleaning-standard-text-normalization-with-spark-nlp/){:target="_blank"}* +- *[Boost Your NLP Results with Spark NLP Stemming and Lemmatizing Techniques](https://www.johnsnowlabs.com/boost-your-nlp-results-with-spark-nlp-stemming-and-lemmatizing-techniques/){:target="_blank"}* +- *[Sample Text Data Preprocessing Implementation In SparkNLP](https://ahmetemin-tek.medium.com/sample-text-data-preprocessing-implementation-in-sparknlp-5de53085fed6){:target="_blank"}* + +**Notebooks** +- *[Text Preprocessing with SparkNLP Annotators Transformers](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/2.Text_Preprocessing_with_SparkNLP_Annotators_Transformers.ipynb){:target="_blank"}* +- *[Text_Preprocessing_with_SparkNLP](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/Text_Preprocessing_with_SparkNLP.ipynb){:target="_blank"}* +- *[Word Stemming with Stemmer](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb){:target="_blank"}* +- *[Document Normalizer](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb){:target="_blank"}* +- *[Cleaning Stop Words](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stop-words/StopWordsCleaner.ipynb){:target="_blank"}* diff --git a/docs/en/tasks/token_classification.md b/docs/en/tasks/token_classification.md new file mode 100644 index 00000000000000..b2703e057f50a8 --- /dev/null +++ b/docs/en/tasks/token_classification.md @@ -0,0 +1,176 @@ +--- +layout: docs +header: true +seotitle: +title: Token Classification +permalink: docs/en/tasks/token_classification +key: docs-tasks-token-classification +modify_date: "2024-09-26" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Token classification** is the task of assigning a **label** to each token (word or sub-word) in a given text sequence. It is fundamental in various *natural language processing (NLP)* tasks like named entity recognition (NER), part-of-speech tagging (POS), and more. 
Spark NLP provides state-of-the-art solutions to tackle token classification challenges effectively, helping you analyze and label individual tokens in a document.

Token classification involves processing text at a granular level, labeling each token for its role or entity. Typical use cases include:

- **Named Entity Recognition (NER):** Identifying proper names, locations, organizations, etc., within text.
- **Part-of-Speech Tagging (POS):** Labeling each token with its grammatical category (e.g., noun, verb, adjective).

By utilizing token classification, organizations can enhance their ability to extract detailed insights from text data, enabling applications like information extraction, text annotation, and more.
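To make the idea of per-token labels concrete, the short sketch below shows the kind of output a named entity recognizer produces using the common BIO scheme (B- marks the beginning of an entity, I- its continuation, O a non-entity token). The tokens and labels are hand-written for illustration only; in practice they come from a pretrained model like the ones shown later on this page.

```python
# Hand-written illustration of token-level labels using the BIO tagging scheme.
# These values are illustrative only; a pretrained model produces them in practice.
tokens = ["John", "Lenon", "was", "born", "in", "London"]
labels = ["B-PER", "I-PER", "O", "O", "O", "B-LOC"]

# Group consecutive B-/I- tags back into entity spans
entities = []
current = None
for token, label in zip(tokens, labels):
    if label.startswith("B-"):
        current = {"tokens": [token], "type": label[2:]}
        entities.append(current)
    elif label.startswith("I-") and current is not None:
        current["tokens"].append(token)
    else:
        current = None

for entity in entities:
    print(" ".join(entity["tokens"]), "->", entity["type"])
# John Lenon -> PER
# London -> LOC
```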
+ +
+ +## Picking a Model + +When selecting a model for token classification, it's important to consider various factors that impact performance. First, analyze the **type of entities or tags** you want to classify (e.g., named entities, parts of speech). Determine if your task requires **fine-grained tagging** (such as multiple types of named entities) or a simpler tag set. + +Next, assess the **complexity of your data**โ€”does it involve formal text like news articles, or informal text like social media posts? **Model performance metrics** (e.g., precision, recall, F1 score) are also key to determining whether a model is suitable. Lastly, evaluate your **computational resources**, as more complex models like BERT may require greater memory and processing power. + +You can explore and select models for your token classification tasks at [Spark NLP Models](https://sparknlp.org/models), where you'll find various models for specific datasets and challenges. + +#### Recommended Models for Specific Token Classification Tasks + +- **Named Entity Recognition (NER):** Use models like [`bert-base-NER`](https://sparknlp.org/2022/05/09/bert_ner_bert_base_NER_en_3_0.html){:target="_blank"} and [`xlm-roberta-large-finetuned-conll03-english`](https://sparknlp.org/2022/08/14/xlmroberta_ner_large_finetuned_conll03_english_xx_3_0.html){:target="_blank"} for general-purpose NER tasks. +- **Part-of-Speech Tagging (POS):** For POS tagging, consider using models such as [`pos_anc`](https://sparknlp.org/2021/03/05/pos_anc.html){:target="_blank"}. +- **Healthcare NER:** For clinical texts, [`ner_jsl`](https://nlp.johnsnowlabs.com/2022/10/19/ner_jsl_en.html){:target="_blank"} and [`pos_clinical`](https://sparknlp.org/2023/02/17/ner_jsl_en.html){:target="_blank"} is tailored for extracting medical entities. + +If existing models do not meet your requirements, you can train your own custom model using the [Spark NLP Training Documentation](https://sparknlp.org/docs/en/training). + +By selecting the appropriate model, you can optimize token classification performance for your specific NLP tasks. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Assembling the document from the input text +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +# Tokenizing the text +tokenizer = Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("token") + +# Loading a pre-trained sequence classification model +# You can replace `BertForTokenClassification.pretrained()` with your selected model and the transformer it's based on +# For example: XlmRoBertaForTokenClassification.pretrained("xlmroberta_ner_large_finetuned_conll03_english","xx") +tokenClassifier = BertForTokenClassification.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("label") \ + .setCaseSensitive(True) + +# Defining the pipeline with document assembler, tokenizer, and classifier +pipeline = Pipeline().setStages([ + documentAssembler, + tokenizer, + tokenClassifier +]) + +# Creating a sample DataFrame +data = spark.createDataFrame([["John Lenon was born in London and lived in Paris. My name is Sarah and I live in London"]]).toDF("text") + +# Fitting the pipeline and transforming the data +result = pipeline.fit(data).transform(data) + +# Showing the results +result.select("label.result").show(truncate=False) + + +``` + +```scala +import spark.implicits._ +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline + +// Step 1: Assembling the document from the input text +// Converts the input 'text' column into a 'document' column, required for NLP tasks +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +// Step 2: Tokenizing the text +// Splits the 'document' column into tokens (words), creating the 'token' column +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +// Step 3: Loading a pre-trained BERT model for token classification +// Applies a pre-trained BERT model for Named Entity Recognition (NER) to classify tokens +// `BertForTokenClassification.pretrained()` loads the model, and `setInputCols` defines the input columns +val tokenClassifier = BertForTokenClassification.pretrained() + .setInputCols("token", "document") + .setOutputCol("label") + .setCaseSensitive(true) + +// Step 4: Defining the pipeline +// The pipeline stages are document assembler, tokenizer, and token classifier +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + tokenizer, + tokenClassifier +)) + +// Step 5: Creating a sample DataFrame +// Creates a DataFrame with a sample sentence that will be processed by the pipeline +val data = Seq("John Lenon was born in London and lived in Paris. 
My name is Sarah and I live in London").toDF("text") + +// Step 6: Fitting the pipeline and transforming the data +// The pipeline is fitted on the input data, then it performs the transformation to generate token labels +val result = pipeline.fit(data).transform(data) + +// Step 7: Showing the results +// Displays the 'label.result' column, which contains the Named Entity Recognition (NER) labels for each token +result.select("label.result").show(false) + +// Output: +// +------------------------------------------------------------------------------------+ +// |result | +// +------------------------------------------------------------------------------------+ +// |[B-PER, I-PER, O, O, O, B-LOC, O, O, O, B-LOC, O, O, O, O, B-PER, O, O, O, O, B-LOC]| +// +------------------------------------------------------------------------------------+ +``` +
+ +## Try Real-Time Demos! + +If you want to see the outputs of text classification models in real time, visit our interactive demos: + +- **[BERT Annotators Demo](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-bert-annotators){:target="_blank"}** โ€“ A live demo where you can try your inputs on classification models on the go. +- **[Named Entity Recognition (NER)](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-named-entity-recognition){:target="_blank"}** โ€“ A live demo where you can try your inputs on NER models on the go. +- **[POS Tagging](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-POS-tagging){:target="_blank"}** โ€“ A live demo where you can try your inputs on preception models on the go. +- **[Recognize Entities - Live Demos & Notebooks](https://sparknlp.org/recognize_entitie){:target="_blank"}** โ€“ An interactive demo for Recognizing Entities in text + +## Useful Resources + +Want to dive deeper into text classification with Spark NLP? Here are some curated resources to help you get started and explore further: + +**Articles and Guides** +- *[Named Entity Recognition (NER) with BERT in Spark NLP](https://www.johnsnowlabs.com/named-entity-recognition-ner-with-bert-in-spark-nlp/){:target="_blank"}* +- *[The Ultimate Guide to Rule-based Entity Recognition with Spark NLP](https://www.johnsnowlabs.com/rule-based-entity-recognition-with-spark-nlp/){:target="_blank"}* +- *[In-Depth Comparison of Spark NLP for Healthcare and ChatGPT on Clinical Named Entity Recognition](https://www.johnsnowlabs.com/in-depth-comparison-of-spark-nlp-for-healthcare-and-chatgpt-on-clinical-named-entity-recognition/){:target="_blank"}* + +**Notebooks** +- *[Transformers for Token Classification in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/4.3_Transformers_for_Token_Classification_in_Spark_NLP.ipynb){:target="_blank"}* + +**Training Scripts** +- *[Training Named Entity Recognition (NER) Deep-Learning models](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/training/english/dl-ner){:target="_blank"}* +- *[Training Conditional Random Fields (CRF) Named Entity Recognition (NER) models](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/training/english/crf-ner/ner_dl_crf.ipynb){:target="_blank"}* \ No newline at end of file diff --git a/docs/en/tasks/translation.md b/docs/en/tasks/translation.md new file mode 100644 index 00000000000000..ad379da7ce5101 --- /dev/null +++ b/docs/en/tasks/translation.md @@ -0,0 +1,135 @@ +--- +layout: docs +header: true +seotitle: +title: Translation +permalink: docs/en/tasks/translation +key: docs-tasks-translation +modify_date: "2024-09-28" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Translation** is the task of converting text from one language into another. This is essential for multilingual applications such as content localization, cross-language communication, and more. Spark NLP offers advanced translation models that provide high-quality translations between multiple languages. + +Translation models process input text in the source language and generate a corresponding translation in the target language. Common use cases include: + +- **Cross-Language Communication:** Enabling communication across different languages for global teams. +- **Document Translation:** Translating long-form content such as reports, articles, or manuals. 
+ +By using Spark NLP translation models, you can build scalable translation systems to meet your multilingual needs efficiently and accurately. + +## Picking a Model + +When choosing a translation model, consider factors such as the **source and target languages** and the **size of the input text**. Some models may specialize in specific language pairs or offer better performance for certain types of text (e.g., formal versus informal content). Evaluate whether you need **document-level translation** or **sentence-level translation** based on the use case. + +Explore the available translation models at [Spark NLP Models](https://sparknlp.org/models) to find the one that best suits your translation tasks. + +#### Recommended Models for Translation Tasks + +- **General Translation:** Consider models such as [`t5_base`](https://sparknlp.org/2021/01/08/t5_base_en.html){:target="_blank"} and [`m2m100_418M`](https://sparknlp.org/2024/05/19/m2m100_418M_xx.html){:target="_blank"} you can also consider searching models with the [`Marian Transformer`](https://sparknlp.org/models?annotator=MarianTransformer){:target="_blank"} Annotator class for translating between non-english languages. + +Selecting the appropriate model will ensure you produce accurate and fluent translations, tailored to your specific language pair and domain. + +## How to use + +
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Document Assembler: Converts input text into a suitable format for NLP processing +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("documents") + +# M2M100 Transformer: Loads the pretrained translation model for English to French +m2m100 = M2M100Transformer.pretrained("m2m100_418M") \ + .setInputCols(["documents"]) \ + .setMaxOutputLength(50) \ + .setOutputCol("generation") \ + .setSrcLang("zh") \ # Source language: Chinese + .setTgtLang("en") # Target language: English + +# Pipeline: Assembles the document assembler and the M2M100 translation model +pipeline = Pipeline().setStages([documentAssembler, m2m100]) + +# Input Data: A small example dataset is created and converted to a DataFrame +data = spark.createDataFrame([["็”Ÿๆดปๅฐฑๅƒไธ€็›’ๅทงๅ…‹ๅŠ›ใ€‚"]]).toDF("text") + +# Running the Pipeline: Fits the pipeline to the data and generates translations +result = pipeline.fit(data).transform(data) + +# Output: Displays the translated result +result.select("summaries.generation").show(truncate=False) + ++-------------------------------------------------------------------------------------------+ +|result | ++-------------------------------------------------------------------------------------------+ +|[ Life is like a box of chocolate.] | ++-------------------------------------------------------------------------------------------+ +``` + +```scala +import spark.implicits._ +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.annotators.seq2seq.M2M100Transformer +import org.apache.spark.ml.Pipeline + +// Document Assembler: Converts input text into a suitable format for NLP processing +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("documents") + +// M2M100 Transformer: Loads the pretrained translation model for Chinese to English +val m2m100 = M2M100Transformer.pretrained("m2m100_418M") + .setInputCols(Array("documents")) + .setSrcLang("zh") // Source language: Chinese + .serTgtLang("en") // Target language: English + .setMaxOutputLength(100) + .setDoSample(false) + .setOutputCol("generation") + +// Pipeline: Assembles the document assembler and the M2M100 translation model +val pipeline = new Pipeline().setStages(Array(documentAssembler, m2m100)) + +// Input Data: A small example dataset is created and converted to a DataFrame +val data = Seq("็”Ÿๆดปๅฐฑๅƒไธ€็›’ๅทงๅ…‹ๅŠ›ใ€‚").toDF("text") + +// Running the Pipeline: Fits the pipeline to the data and generates translations +val result = pipeline.fit(data).transform(data) + +// Output: Displays the translated result +result.select("generation.result").show(truncate = false) + ++-------------------------------------------------------------------------------------------+ +|result | ++-------------------------------------------------------------------------------------------+ +|[ Life is like a box of chocolate.] | ++-------------------------------------------------------------------------------------------+ +``` +
+ +## Try Real-Time Demos! + +If you want to see the outputs of text generation models in real time, visit our interactive demos: + +- **[Text-To-Text Transfer Transformer (Google T5)](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-t5){:target="_blank"}** โ€“ T5 performs text tasks like summarization and translation. +- **[Multilingual Text Translation with MarianMT](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-MarianMT){:target="_blank"}** โ€“ Translates text between multiple languages. +- **[M2M100 Multilingual Translation Model](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-M2M100){:target="_blank"}** โ€“ Translates text between multiple languages. + +## Useful Resources + +Want to dive deeper into text generation with Spark NLP? Here are some curated resources to help you get started and explore further: + +**Articles and Guides** +- *[Multilingual machine translation with Spark NLP](https://www.johnsnowlabs.com/multilingual-machine-translation-with-spark-nlp/){:target="_blank"}* +- *[Use Spark NLP offline models for Language Translation](https://www.linkedin.com/pulse/use-spark-nlp-offline-models-language-translation-mei-wu/){:target="_blank"}* + +**Notebooks** +- *[T5 Workshop with Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/10.1_T5_Workshop_with_Spark_NLP.ipynb){:target="_blank"}* +- *[Translation in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/20.0_Translations.ipynb){:target="_blank"}* \ No newline at end of file diff --git a/docs/en/tasks/zero_shot_classification.md b/docs/en/tasks/zero_shot_classification.md new file mode 100644 index 00000000000000..221493b8741048 --- /dev/null +++ b/docs/en/tasks/zero_shot_classification.md @@ -0,0 +1,152 @@ +--- +layout: docs +header: true +seotitle: +title: Zero-Shot Classification +permalink: docs/en/tasks/zero_shot_classification +key: docs-tasks-zero_shot_classification +modify_date: "2024-09-26" +show_nav: true +sidebar: + nav: sparknlp +--- + +**Zero-Shot Classification** is a method of classifying unseen labels in text without needing any prior training data for those labels. This technique is especially useful for scenarios where pre-defined categories are not available, allowing for flexibility in categorizing text based on descriptions of labels alone. Spark NLP offers state-of-the-art solutions for zero-shot classification, enabling users to classify texts into various categories even when no labeled data is available. + +Zero-shot classification processes text at a broader level, where the system predicts the most relevant labels based on their descriptions. Typical use cases include: + +- **Text Categorization:** Automatically classifying text into a set of predefined or custom categories based on label descriptions. + +By leveraging zero-shot classification, organizations can classify large volumes of text data without the need to curate annotated datasets for each possible label, significantly reducing manual efforts in text annotation and data preparation. + +## Picking a Model + +When selecting a model for zero-shot classification, it is important to consider several factors that impact performance. First, analyze the **range of labels or categories** you want to classify. Zero-shot classification is versatile, but choosing models trained on broader datasets often yields better results. + +Next, consider the **complexity of your text**. 
Is it formal or informal? Does it involve domain-specific language such as legal or healthcare text? **Performance metrics** (e.g., accuracy, precision, recall) help assess whether a model fits your requirements. Additionally, ensure you evaluate your **computational resources**, as larger models, like those based on transformer architectures, may require significant memory and processing power.

You can explore and select models for your zero-shot classification tasks at [Spark NLP Models](https://sparknlp.org/models), where you'll find a variety of models for specific datasets and classification challenges.

#### Recommended Models for Zero-Shot Classification

- **Zero-Shot Text Classification:** Consider using models like [`bart-large-mnli`](https://sparknlp.org/2024/08/27/bart_large_zero_shot_classifier_mnli_en.html){:target="_blank"} for general-purpose text classification across various domains.
- **Zero-Shot Named Entity Recognition (NER):** Use models like [`zero_shot_ner_roberta`](https://sparknlp.org/2023/02/08/zero_shot_ner_roberta_en.html){:target="_blank"} for identifying entities across various domains and languages without requiring task-specific labeled data.

If pre-trained models don't match your exact needs, you can train your own custom model using the [Spark NLP Training Documentation](https://sparknlp.org/docs/en/training).

By selecting the appropriate zero-shot classification model, you can expand your ability to analyze text data without predefined labels, providing flexibility for dynamic and evolving classification tasks.

## How to use
+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# 1. Document Assembler: Converts raw input text into a document format. +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +# 2. Tokenizer: Splits the document into individual tokens (words). +tokenizer = Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("token") + +# 3. Pre-trained Sequence Classifier (Zero-Shot Classification): Loads a pre-trained BART model for zero-shot classification. +sequenceClassifier = BartForZeroShotClassification.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("label") \ + .setCaseSensitive(True) + +# 4. Pipeline: Defines a pipeline with three stages - document assembler, tokenizer, and zero-shot classifier. +pipeline = Pipeline().setStages([ + documentAssembler, + tokenizer, + sequenceClassifier +]) + +# 5. Sample Data: Creating a DataFrame with sample text data to test zero-shot classification. +data = spark.createDataFrame([["I loved this movie when I was a child.", "It was pretty boring."]]).toDF("text") + +# 6. Fit and Transform: Fits the pipeline to the data and applies the model for classification. +result = pipeline.fit(data).transform(data) + +# 7. Displaying Results: Shows the classification labels assigned to each text (e.g., positive or negative sentiment). +result.select("label.result").show(truncate=False) + + +``` + +```scala +import spark.implicits._ +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline + +// Assembling the document from the input text +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +// Tokenizing the text +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +// Loading the pre-trained zero-shot classification model (BERT) +val sequenceClassifier = BertForZeroShotClassification.pretrained() + .setInputCols("token", "document") + .setOutputCol("label") + .setCaseSensitive(true) + +// Creating a pipeline with document assembler, tokenizer, and classifier +val pipeline = new Pipeline().setStages(Array( + documentAssembler, + tokenizer, + sequenceClassifier +)) + +// Creating a sample DataFrame +val data = Seq("I loved this movie when I was a child.", "It was pretty boring.").toDF("text") + +// Fitting the pipeline and transforming the data +val result = pipeline.fit(data).transform(data) + +// Showing the results +result.select("label.result").show(false) + +// Sample Output: +// +------+ +// |result| +// +------+ +// |[pos] | +// |[neg] | +// +------+ +``` +

## Try Real-Time Demos!

If you want to see the outputs of text classification models in real time, visit our interactive demos:

- **[BERT Annotators Demo](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-bert-annotators){:target="_blank"}** – A live demo where you can try your own labels and inputs on zero-shot classification models on the go.
- **[Zero-Shot Named Entity Recognition (NER)](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-Zero-Shot-NER){:target="_blank"}** – A live demo where you can try your own labels and inputs on zero-shot NER models on the go.

## Useful Resources

Want to dive deeper into text classification with Spark NLP? Here are some curated resources to help you get started and explore further:

**Notebooks**
- *[Zero-Shot Text Classification in Spark NLP](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/5.4_ZeroShot_Text_Classification.ipynb){:target="_blank"}*
- *[Zero-Shot for Named Entity Recognition](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/4.2_ZeroShot_NER.ipynb){:target="_blank"}*
diff --git a/docs/en/tasks/zero_shot_image_classification.md b/docs/en/tasks/zero_shot_image_classification.md
new file mode 100644
index 00000000000000..d3744a2de7e463
--- /dev/null
+++ b/docs/en/tasks/zero_shot_image_classification.md
@@ -0,0 +1,186 @@
---
layout: docs
header: true
seotitle:
title: Zero-shot Image Classification
permalink: docs/en/tasks/zero_shot_image_classification
key: docs-tasks-zero-shot-image-classification
modify_date: "2024-09-26"
show_nav: true
sidebar:
  nav: sparknlp
---

**Zero-shot image classification** is a technique in computer vision where a model can classify images into categories that it has never seen before during training. This is achieved by leveraging semantic relationships between the image data and textual descriptions of classes, enabling models to predict labels without specific training on each category.

This task is particularly useful for scenarios where obtaining labeled data for every possible category is challenging or expensive, such as real-world applications in e-commerce, media, or biology. Zero-shot classification can help scale image recognition systems without constantly retraining them for new categories.

## How Zero-shot Image Classification Works

The key idea behind zero-shot learning is the generalization capability of models. Instead of being restricted to the labels encountered during training, the model uses external knowledge, typically in the form of text or word embeddings, to make predictions about new classes.

In Spark NLP, zero-shot image classification leverages models like CLIP (Contrastive Language-Image Pretraining), which are trained to understand both visual and textual data. These models align the visual representations of images with the semantic representations of text, allowing them to match unseen image categories based on their descriptions. A minimal sketch of this matching step is shown after the list below.

Some common use cases include:

- **Classifying new product images** in an e-commerce platform without retraining the model for every new product.
- **Detecting rare or new species of animals** using images in wildlife research.
- **Media categorization** for content recommendation engines where new labels continuously emerge.
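
To make the matching step described above concrete, the following framework-agnostic sketch shows the core idea behind CLIP-style zero-shot classification: compare one image embedding against an embedding for each candidate label and pick the best match. The `image_embedding` and `label_embeddings` arrays are assumed to come from a CLIP image encoder and text encoder; this is illustrative pseudocode, not Spark NLP API usage.

```python
import numpy as np

def zero_shot_classify(image_embedding, label_embeddings, labels):
    """Return the best-matching label and a probability per candidate label."""
    # L2-normalize so that dot products become cosine similarities.
    image_embedding = image_embedding / np.linalg.norm(image_embedding)
    label_embeddings = label_embeddings / np.linalg.norm(label_embeddings, axis=1, keepdims=True)

    # One similarity score per candidate label.
    similarities = label_embeddings @ image_embedding

    # Softmax turns the similarities into a probability distribution over labels.
    probs = np.exp(similarities) / np.exp(similarities).sum()
    return labels[int(np.argmax(probs))], probs
```

In Spark NLP, `CLIPForZeroShotClassification` performs this comparison internally; you only provide the candidate labels as text.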

## Picking a Model

When choosing a model for zero-shot image classification, you need to consider several factors:

- **Text and Image Alignment:** Choose models that are good at matching visual features to text-based descriptions.
- **Task Complexity:** Depending on the complexity of the task, a larger pre-trained model like CLIP or a fine-tuned ViT model might perform better.
- **Efficiency:** While zero-shot classification saves time by avoiding retraining, some models are more resource-intensive than others. Make sure the model is efficient enough for your computational setup.

You can explore a variety of pre-trained zero-shot models at [Spark NLP Models](https://sparknlp.org/models){:target="_blank"}, where models suited for different tasks and datasets are available.

#### Recommended Models for Zero-shot Image Classification
- **CLIP for General Zero-shot Image Classification:** Models like [`clip_vit_large_patch14`](https://sparknlp.org/2024/09/24/clip_vit_large_patch14_en.html){:target="_blank"} and [`clip-vit-base-patch32`](https://sparknlp.org/2023/12/02/zero_shot_classifier_clip_vit_base_patch32_en.html){:target="_blank"} are well-suited for matching image content with textual labels in a zero-shot setting.

## How to use

+{% include programmingLanguageSelectScalaPython.html %} +```python +import sparknlp +from sparknlp.base import * +from sparknlp.annotator import * +from pyspark.ml import Pipeline + +# Loading images into a Spark DataFrame, with an option to discard invalid images +imageDF = spark.read \ + .format("image") \ + .option("dropInvalid", value=True) \ + .load("src/test/resources/image/") + +# Assembling image data using the ImageAssembler, preparing the input images for further processing +imageAssembler = ImageAssembler() \ + .setInputCol("image") \ + .setOutputCol("image_assembler") + +# Defining candidate labels for zero-shot classification +candidateLabels = [ + "a photo of a bird", + "a photo of a cat", + "a photo of a dog", + "a photo of a hen", + "a photo of a hippo", + "a photo of a room", + "a photo of a tractor", + "a photo of an ostrich", + "a photo of an ox" +] + +# Initializing the CLIPForZeroShotClassification model +imageClassifier = CLIPForZeroShotClassification \ + .pretrained("clip_vit_large_patch14", "en") \ + .setInputCols(["image_assembler"]) \ + .setOutputCol("label") \ + .setCandidateLabels(candidateLabels) + +# Defining a Spark ML pipeline with two stages: the ImageAssembler and the CLIP image classifier +pipeline = Pipeline().setStages([imageAssembler, imageClassifier]) + +# Fitting the pipeline on the image DataFrame and transforming the data to apply classification +pipelineDF = pipeline.fit(imageDF).transform(imageDF) + +# Selecting the image file name and the predicted label result, displaying the output in a readable format +pipelineDF \ + .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "label.result") \ + .show(truncate=False) + ++-----------------+-----------------------+ +|image_name |result | ++-----------------+-----------------------+ +|palace.JPEG |[a photo of a room] | +|egyptian_cat.jpeg|[a photo of a cat] | +|hippopotamus.JPEG|[a photo of a hippo] | +|hen.JPEG |[a photo of a hen] | +|ostrich.JPEG |[a photo of an ostrich]| +|junco.JPEG |[a photo of a bird] | +|bluetick.jpg |[a photo of a dog] | +|chihuahua.jpg |[a photo of a dog] | +|tractor.JPEG |[a photo of a tractor] | +|ox.JPEG |[a photo of an ox] | ++-----------------+-----------------------+ +``` +```scala +import com.johnsnowlabs.nlp.ImageAssembler +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline + +// Loading image data into a Spark DataFrame, removing any invalid images +val imageDF = ResourceHelper.spark.read + .format("image") + .option("dropInvalid", value = true) + .load("src/test/resources/image/") + +// Assembling the images with the ImageAssembler, which prepares image data for processing +val imageAssembler: ImageAssembler = new ImageAssembler() + .setInputCol("image") + .setOutputCol("image_assembler") + +// Defining an array of candidate labels for zero-shot image classification +val candidateLabels = Array( + "a photo of a bird", + "a photo of a cat", + "a photo of a dog", + "a photo of a hen", + "a photo of a hippo", + "a photo of a room", + "a photo of a tractor", + "a photo of an ostrich", + "a photo of an ox" +) + +// Initializing the CLIPForZeroShotClassification model, setting input and output columns +// The model classifies images based on comparison to the candidate labels +val imageClassifier = CLIPForZeroShotClassification + .pretrained() // Loading a pretrained CLIP model + .setInputCols("image_assembler") + .setOutputCol("label") + .setCandidateLabels(candidateLabels) + +// Creating and running the Spark ML pipeline with the 
image assembler and classifier +val pipeline = + new Pipeline().setStages(Array(imageAssembler, imageClassifier)).fit(imageDF).transform(imageDF) + +// Selecting and displaying the image file name and classification result +pipeline + .selectExpr("reverse(split(image.origin, '/'))[0] as image_name", "label.result") // Extracting image names and their classification labels + .show(truncate = false) + ++-----------------+-----------------------+ +|image_name |result | ++-----------------+-----------------------+ +|palace.JPEG |[a photo of a room] | +|egyptian_cat.jpeg|[a photo of a cat] | +|hippopotamus.JPEG|[a photo of a hippo] | +|hen.JPEG |[a photo of a hen] | +|ostrich.JPEG |[a photo of an ostrich]| +|junco.JPEG |[a photo of a bird] | +|bluetick.jpg |[a photo of a dog] | +|chihuahua.jpg |[a photo of a dog] | +|tractor.JPEG |[a photo of a tractor] | +|ox.JPEG |[a photo of an ox] | ++-----------------+-----------------------+ +``` +
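
For single images outside of a DataFrame, the fitted pipeline can also be wrapped in a `LightPipeline`. This is a minimal sketch that assumes the `pipeline` and `imageDF` objects from the Python example above; the image path is only a placeholder.

```python
from sparknlp.base import LightPipeline

# Wrap the fitted pipeline for ad-hoc predictions on individual image files.
light_model = LightPipeline(pipeline.fit(imageDF))

# fullAnnotateImage accepts a single path or a list of paths.
annotations = light_model.fullAnnotateImage("src/test/resources/image/hippopotamus.JPEG")

for image_result in annotations:
    # "label" holds the zero-shot prediction for this image.
    print(image_result["label"][0].result)
```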
+ +## Try Real-Time Demos! + +Explore zero-shot image classification with our interactive demos: + +- **[CLIP for Zero-shot Image Classification](https://huggingface.co/spaces/abdullahmubeen10/sparknlp-CLIPForZeroShotClassification){:target="_blank"}** + +## Useful Resources + +Learn zero-shot image classification with Spark NLP: + +**Notebooks** +- *[CLIP Classification Notebook](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/image/CLIPForZeroShotClassification.ipynb){:target="_blank"}* + +Discover how to classify images without labeled data. From 0102a1bbf1d679a1db7228c5e17b57494a147a34 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Fri, 18 Oct 2024 13:12:57 -0500 Subject: [PATCH 09/24] Introducing BertForMultipleChoice transformer (#14435) * [SPARKNLP-1084] Introducing BertForMultipleChoice * [SPARKNLP-1084] Introducing BertForMultipleChoice transformer --- .../annotator/classifier_dl/__init__.py | 2 +- .../classifier_dl/bert_for_multiple_choice.py | 161 +++++++++ python/sparknlp/internal/__init__.py | 7 + .../bert_for_multiple_choice_test.py | 76 ++++ .../ml/ai/BertClassification.scala | 92 ++++- .../ml/ai/XXXForClassification.scala | 71 ++++ .../ml/ai/util/PrepareEmbeddings.scala | 44 ++- .../classifier/dl/BertForMultipleChoice.scala | 334 ++++++++++++++++++ .../nlp/pretrained/ResourceDownloader.scala | 4 +- .../dl/BertForMultipleChoiceTestSpec.scala | 82 +++++ 10 files changed, 860 insertions(+), 13 deletions(-) create mode 100644 python/sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py create mode 100644 python/test/annotator/classifier_dl/bert_for_multiple_choice_test.py create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoice.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoiceTestSpec.scala diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py index bbd9f60a8dfbba..2b5e30fc3ff359 100644 --- a/python/sparknlp/annotator/classifier_dl/__init__.py +++ b/python/sparknlp/annotator/classifier_dl/__init__.py @@ -54,4 +54,4 @@ from sparknlp.annotator.classifier_dl.mpnet_for_token_classification import * from sparknlp.annotator.classifier_dl.albert_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.camembert_for_zero_shot_classification import * - +from sparknlp.annotator.classifier_dl.bert_for_multiple_choice import * diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py b/python/sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py new file mode 100644 index 00000000000000..2c27f913e56fcc --- /dev/null +++ b/python/sparknlp/annotator/classifier_dl/bert_for_multiple_choice.py @@ -0,0 +1,161 @@ +# Copyright 2017-2024 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

from sparknlp.common import *

class BertForMultipleChoice(AnnotatorModel,
                            HasCaseSensitiveProperties,
                            HasBatchedAnnotate,
                            HasEngine,
                            HasMaxSentenceLengthLimit):
    """BertForMultipleChoice can load BERT Models with a multiple choice classification head on top
    (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> spanClassifier = BertForMultipleChoice.pretrained() \\
    ...     .setInputCols(["document_question", "document_context"]) \\
    ...     .setOutputCol("answer")

    The default model is ``"bert_base_uncased_multiple_choice"``, if no name is
    provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Multiple+Choice>`__.

    To see which models are compatible and how to import them see
    `Import Transformers into Spark NLP 🚀
    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``DOCUMENT, DOCUMENT`` ``CHUNK``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Batch size. Large values allow faster processing but require more
        memory, by default 4
    caseSensitive
        Whether to ignore case in tokens for embeddings matching, by default
        False
    maxSentenceLength
        Max sentence length to process, by default 512

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = MultiDocumentAssembler() \\
    ...     .setInputCols(["question", "context"]) \\
    ...     .setOutputCols(["document_question", "document_context"])
    >>> questionAnswering = BertForMultipleChoice.pretrained() \\
    ...     .setInputCols(["document_question", "document_context"]) \\
    ...     .setOutputCol("answer") \\
    ...     .setCaseSensitive(False)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     questionAnswering
    ... ])
    >>> data = spark.createDataFrame([["The Eiffel Tower is located in which country?", "Germany, France, Italy"]]).toDF("question", "context")
    >>> result = pipeline.fit(data).transform(data)
    >>> result.select("answer.result").show(truncate=False)
    +--------------------+
    |result              |
    +--------------------+
    |[France]            |
    +--------------------+
    """
    name = "BertForMultipleChoice"

    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]

    outputAnnotatorType = AnnotatorType.CHUNK

    choicesDelimiter = Param(Params._dummy(),
                             "choicesDelimiter",
                             "Delimiter character used to split the choices",
                             TypeConverters.toString)

    def setChoicesDelimiter(self, value):
        """Sets the delimiter character used to split the choices.

        Parameters
        ----------
        value : string
            Delimiter character used to split the choices
        """
        return self._set(choicesDelimiter=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.BertForMultipleChoice",
                 java_model=None):
        super(BertForMultipleChoice, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=4,
            maxSentenceLength=512,
            caseSensitive=False,
            choicesDelimiter=","
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.
+ + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + BertForQuestionAnswering + The restored model + """ + from sparknlp.internal import _BertMultipleChoiceLoader + jModel = _BertMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj + return BertForMultipleChoice(java_model=jModel) + + @staticmethod + def pretrained(name="bert_base_uncased_multiple_choice", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default + "bert_base_uncased_multiple_choice" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + BertForQuestionAnswering + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(BertForMultipleChoice, name, lang, remote_loc) \ No newline at end of file diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 0386e5201968e4..eec3544dc41c6f 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -113,6 +113,13 @@ def __init__(self, path, jspark): jspark, ) +class _BertMultipleChoiceLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_BertMultipleChoiceLoader, self).__init__( + "com.johnsnowlabs.nlp.annotators.classifier.dl.BertForMultipleChoice.loadSavedModel", + path, + jspark, + ) class _DeBERTaLoader(ExtendedJavaWrapper): def __init__(self, path, jspark): diff --git a/python/test/annotator/classifier_dl/bert_for_multiple_choice_test.py b/python/test/annotator/classifier_dl/bert_for_multiple_choice_test.py new file mode 100644 index 00000000000000..369ecd44374b19 --- /dev/null +++ b/python/test/annotator/classifier_dl/bert_for_multiple_choice_test.py @@ -0,0 +1,76 @@ +# Copyright 2017-2024 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +class BertForMultipleChoiceTestSetup(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.question = "The Eiffel Tower is located in which country?" 
+ self.choices = "Germany, France, Italy" + + self.spark = SparkContextForTest.spark + empty_df = self.spark.createDataFrame([[""]]).toDF("text") + + document_assembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCols(["document_question", "document_context"]) + + bert_for_multiple_choice = BertForMultipleChoice.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + + pipeline = Pipeline(stages=[document_assembler, bert_for_multiple_choice]) + + self.pipeline_model = pipeline.fit(empty_df) + + +@pytest.mark.slow +class BertForMultipleChoiceTest(BertForMultipleChoiceTestSetup, unittest.TestCase): + + def setUp(self): + super().setUp() + self.data = self.spark.createDataFrame([[self.question, self.choices]]).toDF("question","context") + self.data.show(truncate=False) + + def test_run(self): + result_df = self.pipeline_model.transform(self.data) + result_df.show(truncate=False) + for row in result_df.collect(): + self.assertTrue(row["answer"][0].result != "") + + +@pytest.mark.slow +class LightBertForMultipleChoiceTest(BertForMultipleChoiceTestSetup, unittest.TestCase): + + def setUp(self): + super().setUp() + + def runTest(self): + light_pipeline = LightPipeline(self.pipeline_model) + annotations_result = light_pipeline.fullAnnotate(self.question,self.choices) + print(annotations_result) + for result in annotations_result: + self.assertTrue(result["answer"][0].result != "") + + result = light_pipeline.annotate(self.question,self.choices) + print(result) + self.assertTrue(result["answer"] != "") diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala index e8ed6f51d2ff17..15f9345c3da88b 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala @@ -130,7 +130,7 @@ private[johnsnowlabs] class BertClassification( // we need the original form of the token // let's lowercase if needed right before the encoding - val basicTokenizer = new BasicTokenizer(caseSensitive = true, hasBeginEnd = false) + val basicTokenizer = new BasicTokenizer(caseSensitive = caseSensitive, hasBeginEnd = false) val encoder = new WordpieceEncoder(vocabulary) val sentences = docs.map { s => Sentence(s.result, s.begin, s.end, 0) } @@ -546,6 +546,15 @@ private[johnsnowlabs] class BertClassification( (startScores, endScores) } + override def tagSpanMultipleChoice(batch: Seq[Array[Int]]): Array[Float] = { + val logits = detectedEngine match { + case ONNX.name => computeLogitsMultipleChoiceWithOnnx(batch) + case Openvino.name => computeLogitsMultipleChoiceWithOv(batch) + } + + calculateSoftmax(logits) + } + private def computeLogitsWithTF( batch: Seq[Array[Int]], maxSentenceLength: Int): (Array[Float], Array[Float]) = { @@ -732,6 +741,87 @@ private[johnsnowlabs] class BertClassification( } } + private def computeLogitsMultipleChoiceWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + val sequenceLength = batch.head.length + val inputIds = Array(batch.map(x => x.map(_.toLong)).toArray) + val attentionMask = Array( + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + val tokenTypeIds = Array(batch.map(_ => Array.fill(sequenceLength)(0L)).toArray) + + val (ortSession, ortEnv) = onnxWrapper.get.getSession(onnxSessionOptions) + val tokenTensors = OnnxTensor.createTensor(ortEnv, inputIds) + val maskTensors = OnnxTensor.createTensor(ortEnv, 
attentionMask) + val segmentTensors = OnnxTensor.createTensor(ortEnv, tokenTypeIds) + + val inputs = + Map( + "input_ids" -> tokenTensors, + "attention_mask" -> maskTensors, + "token_type_ids" -> segmentTensors).asJava + + try { + val output = ortSession.run(inputs) + try { + + val logits = output + .get("logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + + logits + } finally if (output != null) output.close() + } catch { + case e: Exception => + // Log the exception as a warning + println("Exception in computeLogitsMultipleChoiceWithOnnx: ", e) + // Rethrow the exception to propagate it further + throw e + } + } + + private def computeLogitsMultipleChoiceWithOv(batch: Seq[Array[Int]]): Array[Float] = { + val (numChoices, sequenceLength) = (batch.length, batch.head.length) + // batch_size, num_choices, sequence_length + val shape = Some(Array(1, numChoices, sequenceLength)) + val (tokenTensors, maskTensors, segmentTensors) = + PrepareEmbeddings.prepareOvLongBatchTensorsWithSegment( + batch, + sequenceLength, + numChoices, + sentencePadTokenId, + shape) + + val compiledModel = openvinoWrapper.get.getCompiledModel() + val inferRequest = compiledModel.create_infer_request() + inferRequest.set_tensor("input_ids", tokenTensors) + inferRequest.set_tensor("attention_mask", maskTensors) + inferRequest.set_tensor("token_type_ids", segmentTensors) + + inferRequest.infer() + + try { + try { + val logits = inferRequest + .get_output_tensor() + .data() + + logits + } + } catch { + case e: Exception => + // Log the exception as a warning + logger.warn("Exception in computeLogitsMultipleChoiceWithOv", e) + // Rethrow the exception to propagate it further + throw e + } + } + def findIndexedToken( tokenizedSentences: Seq[TokenizedSentence], sentence: (WordpieceTokenizedSentence, Int), diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala index 919d6aa0d17c6e..af40658d46168d 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala @@ -304,6 +304,43 @@ private[johnsnowlabs] trait XXXForClassification { } + def predictSpanMultipleChoice( + documents: Seq[Annotation], + choicesDelimiter: String, + maxSentenceLength: Int, + caseSensitive: Boolean): Seq[Annotation] = { + + val questionAnnotation = Seq(documents.head) + val choices = + documents.drop(1).flatMap(annotation => annotation.result.split(choicesDelimiter)) + + val wordPieceTokenizedQuestions = + tokenizeDocument(questionAnnotation, maxSentenceLength, caseSensitive) + + val inputIds = choices.flatMap { choice => + val choiceAnnotation = + Seq(Annotation(AnnotatorType.DOCUMENT, 0, choice.length, choice, Map("sentence" -> "0"))) + val wordPieceTokenizedChoice = + tokenizeDocument(choiceAnnotation, maxSentenceLength, caseSensitive) + encodeSequenceWithPadding( + wordPieceTokenizedQuestions, + wordPieceTokenizedChoice, + maxSentenceLength) + } + + val scores = tagSpanMultipleChoice(inputIds) + val (score, scoreIndex) = scores.zipWithIndex.maxBy(_._1) + val prediction = choices(scoreIndex) + + Seq( + Annotation( + annotatorType = AnnotatorType.CHUNK, + begin = 0, + end = if (prediction.isEmpty) 0 else prediction.length - 1, + result = prediction, + metadata = Map("sentence" -> "0", "chunk" -> "0", "score" -> score.toString))) + } + def tokenizeWithAlignment( sentences: 
Seq[TokenizedSentence], maxSeqLength: Int, @@ -362,6 +399,38 @@ private[johnsnowlabs] trait XXXForClassification { Seq(Array(sentenceStartTokenId) ++ question ++ context) } + def encodeSequenceWithPadding( + seq1: Seq[WordpieceTokenizedSentence], + seq2: Seq[WordpieceTokenizedSentence], + maxSequenceLength: Int): Seq[Array[Int]] = { + + val question = seq1.flatMap { wpTokSentence => + wpTokSentence.tokens.map(t => t.pieceId) + }.toArray + + val context = seq2.flatMap { wpTokSentence => + wpTokSentence.tokens.map(t => t.pieceId) + }.toArray + + val availableLength = maxSequenceLength - 3 // (excluding special tokens) + val truncatedQuestion = question.take(availableLength) + val remainingLength = availableLength - truncatedQuestion.length + val truncatedContext = context.take(remainingLength) + + val assembleSequence = + Array(sentenceStartTokenId) ++ truncatedQuestion ++ Array(sentenceEndTokenId) ++ + truncatedContext ++ Array(sentenceEndTokenId) + + val paddingLength = maxSequenceLength - assembleSequence.length + val paddedSequence = if (paddingLength > 0) { + assembleSequence ++ Array.fill(paddingLength)(sentencePadTokenId) + } else { + assembleSequence + } + + Seq(paddedSequence) + } + def tag(batch: Seq[Array[Int]]): Seq[Array[Array[Float]]] def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] @@ -374,6 +443,8 @@ private[johnsnowlabs] trait XXXForClassification { def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) + def tagSpanMultipleChoice(batch: Seq[Array[Int]]): Array[Float] = Array() + /** Calculate softmax from returned logits * @param scores * logits output from output layer diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/util/PrepareEmbeddings.scala b/src/main/scala/com/johnsnowlabs/ml/ai/util/PrepareEmbeddings.scala index ddb85236678326..6529697fcfde74 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/util/PrepareEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/util/PrepareEmbeddings.scala @@ -82,18 +82,42 @@ private[johnsnowlabs] object PrepareEmbeddings { batch: Seq[Array[Int]], maxSentenceLength: Int, batchLength: Int, - sentencePadTokenId: Int = 0) + sentencePadTokenId: Int = 0, + shape: Option[Array[Int]] = None) : (org.intel.openvino.Tensor, org.intel.openvino.Tensor, org.intel.openvino.Tensor) = { - val shape = Array(batchLength, maxSentenceLength) - val tokenTensors = - new org.intel.openvino.Tensor(shape, batch.flatten.toArray) - val maskTensors = new org.intel.openvino.Tensor( - shape, - batch - .flatMap(sentence => sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) - .toArray) + val tensorsShape = if (shape.isDefined) shape.get else Array(batchLength, maxSentenceLength) + val inputIds = batch.flatten.toArray + val attentionMask = batch + .flatMap(sentence => sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) + .toArray + + val tokenTensors = new org.intel.openvino.Tensor(tensorsShape, inputIds) + val maskTensors = new org.intel.openvino.Tensor(tensorsShape, attentionMask) + + val segmentTensors = + new org.intel.openvino.Tensor(tensorsShape, Array.fill(batchLength * maxSentenceLength)(0)) + + (tokenTensors, maskTensors, segmentTensors) + } + + def prepareOvLongBatchTensorsWithSegment( + batch: Seq[Array[Int]], + maxSentenceLength: Int, + batchLength: Int, + sentencePadTokenId: Int = 0, + shape: Option[Array[Int]] = None) + : (org.intel.openvino.Tensor, org.intel.openvino.Tensor, org.intel.openvino.Tensor) = { + val tensorsShape = if (shape.isDefined) shape.get else 
Array(batchLength, maxSentenceLength) + val inputIds = batch.flatMap(x => x.map(xx => xx.toLong)).toArray + val attentionMask = batch + .flatMap(sentence => sentence.map(x => if (x == sentencePadTokenId) 0L else 1L)) + .toArray + + val tokenTensors = new org.intel.openvino.Tensor(tensorsShape, inputIds) + val maskTensors = new org.intel.openvino.Tensor(tensorsShape, attentionMask) + val segmentTensors = - new org.intel.openvino.Tensor(shape, Array.fill(batchLength * maxSentenceLength)(0)) + new org.intel.openvino.Tensor(tensorsShape, Array.fill(batchLength * maxSentenceLength)(0L)) (tokenTensors, maskTensors, segmentTensors) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoice.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoice.scala new file mode 100644 index 00000000000000..eb2bd85580ed46 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoice.scala @@ -0,0 +1,334 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.ml.ai.BertClassification +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} +import com.johnsnowlabs.ml.tensorflow.TensorflowWrapper +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.{ONNX, Openvino} +import com.johnsnowlabs.nlp.serialization.MapFeature +import com.johnsnowlabs.nlp._ +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param.{IntParam, Param} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** BertForMultipleChoice can load BERT Models with a multiple choice classification head on top + * (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val spanClassifier = BertForMultipleChoice.pretrained() + * .setInputCols(Array("document_question", "document_context")) + * .setOutputCol("answer") + * }}} + * The default model is `"bert_base_uncased_multiple_choice"`, if no name is provided. + * + * For available pretrained models please see the + * [[https://sparknlp.org/models?task=Multiple+Choice Models Hub]]. + * + * Models from the HuggingFace ๐Ÿค— Transformers library are also compatible with Spark NLP ๐Ÿš€. To + * see which models are compatible and how to import them see + * [[https://github.com/JohnSnowLabs/spark-nlp/discussions/5669]] and to see more extended + * examples, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoiceTestSpec.scala BertForMultipleChoiceTestSpec]]. 
+ * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * + * val document = new MultiDocumentAssembler() + * .setInputCols("question", "context") + * .setOutputCols("document_question", "document_context") + * + * val questionAnswering = BertForMultipleChoice.pretrained() + * .setInputCols(Array("document_question", "document_context")) + * .setOutputCol("answer") + * .setCaseSensitive(false) + * + * val pipeline = new Pipeline().setStages(Array( + * document, + * questionAnswering + * )) + * + * val data = Seq("The Eiffel Tower is located in which country?", "Germany, France, Italy").toDF("question", "context") + * val result = pipeline.fit(data).transform(data) + * + * result.select("answer.result").show(false) + * +---------------------+ + * |result | + * +---------------------+ + * |[France] | + * ++--------------------+ + * }}} + * + * @see + * [[BertForQuestionAnswering]] for Question Answering tasks + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based classifiers + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ + +class BertForMultipleChoice(override val uid: String) + extends AnnotatorModel[BertForMultipleChoice] + with HasBatchedAnnotate[BertForMultipleChoice] + with WriteOnnxModel + with WriteOpenvinoModel + with HasCaseSensitiveProperties + with HasEngine { + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("BertForMultipleChoice")) + + /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator + * type + */ + override val inputAnnotatorTypes: Array[AnnotatorType] = + Array(AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT) + override val outputAnnotatorType: AnnotatorType = AnnotatorType.CHUNK + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) + + /** @group setParam */ + def sentenceStartTokenId: Int = { + $$(vocabulary)("[CLS]") + } + + /** @group setParam */ + def sentenceEndTokenId: Int = { + $$(vocabulary)("[SEP]") + } + + /** Max sentence length to process (Default: `512`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "BERT models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength must be at least 1") + set(maxSentenceLength, value) + this + } + + val choicesDelimiter = + new Param[String](this, "choicesDelimiter", "Delimiter character use to split the choices") + + def setChoicesDelimiter(value: String): this.type = set(choicesDelimiter, value) + + private var _model: Option[Broadcast[BertClassification]] = None + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper]): BertForMultipleChoice = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new BertClassification( + tensorflowWrapper, + onnxWrapper, + openvinoWrapper, + sentenceStartTokenId, + sentenceEndTokenId, + configProtoBytes = None, + tags = Map.empty[String, Int], + signatures = None, + vocabulary = $$(vocabulary)))) + } + + this + } + + /** @group getParam */ + def getModelIfNotSet: BertClassification = _model.get.value + + setDefault( + batchSize -> 4, + maxSentenceLength -> 512, + caseSensitive -> false, + choicesDelimiter -> ",") + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations in batches that correspond to inputAnnotationCols generated by previous + * annotators if any + * @return + * any number of annotations processed for every batch of input annotations. Not necessary + * one to one relationship + * + * IMPORTANT: !MUST! return sequences of equal lengths !! IMPORTANT: !MUST! return sentences + * that belong to the same original row !! 
(challenging) + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + batchedAnnotations.map(annotations => { + if (annotations.nonEmpty) { + getModelIfNotSet.predictSpanMultipleChoice( + annotations, + $(choicesDelimiter), + $(maxSentenceLength), + $(caseSensitive)) + } else { + Seq.empty[Annotation] + } + }) + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + getEngine match { + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + "_bert_multiple_choice_classification", + BertForMultipleChoice.onnxFile) + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + BertForMultipleChoice.openvinoFile) + + } + } + +} + +trait ReadablePretrainedBertForMultipleChoiceModel + extends ParamsAndFeaturesReadable[BertForMultipleChoice] + with HasPretrained[BertForMultipleChoice] { + override val defaultModelName: Some[String] = Some("bert_base_uncased_multiple_choice") + + /** Java compliant-overrides */ + override def pretrained(): BertForMultipleChoice = super.pretrained() + + override def pretrained(name: String): BertForMultipleChoice = super.pretrained(name) + + override def pretrained(name: String, lang: String): BertForMultipleChoice = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): BertForMultipleChoice = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadBertForMultipleChoiceModel extends ReadOnnxModel with ReadOpenvinoModel { + this: ParamsAndFeaturesReadable[BertForMultipleChoice] => + + override val onnxFile: String = "bert_mc_classification_onnx" + override val openvinoFile: String = "bert_mc_classification_openvino" + + def readModel(instance: BertForMultipleChoice, path: String, spark: SparkSession): Unit = { + instance.getEngine match { + case ONNX.name => + val onnxWrapper = + readOnnxModel(path, spark, "bert_mc_classification_onnx") + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None) + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "bert_mc_classification_ov") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper)) + case _ => + throw new Exception(notSupportedEngineError) + } + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): BertForMultipleChoice = { + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap + val annotatorModel = new BertForMultipleChoice().setVocabulary(vocabs) + annotatorModel.set(annotatorModel.engine, detectedEngine) + + detectedEngine match { + case ONNX.name => + val onnxWrapper = + OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), None) + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper)) + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } + +} + +/** This is the companion object of [[BertForMultipleChoice]]. Please refer to that class for the + * documentation. 
+ */ +object BertForMultipleChoice + extends ReadablePretrainedBertForMultipleChoiceModel + with ReadBertForMultipleChoiceModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index d0ba5238deedaa..f271566e04715a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -688,7 +688,9 @@ object PythonResourceDownloader { "AlbertForZeroShotClassification" -> AlbertForZeroShotClassification, "MxbaiEmbeddings" -> MxbaiEmbeddings, "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings, - "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification) + "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification, + "BertForMultipleChoice" -> BertForMultipleChoice + ) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoiceTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoiceTestSpec.scala new file mode 100644 index 00000000000000..6aebffb53e8083 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoiceTestSpec.scala @@ -0,0 +1,82 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.nlp.annotators.SparkSessionTest +import com.johnsnowlabs.nlp.base.LightPipeline +import com.johnsnowlabs.nlp.{Annotation, AssertAnnotations, MultiDocumentAssembler} +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class BertForMultipleChoiceTestSpec extends AnyFlatSpec with SparkSessionTest { + + import spark.implicits._ + + lazy val pipelineModel = getBertForMultipleChoicePipelineModel + + val testDataframe = Seq( + ("The Eiffel Tower is located in which country?", "Germany, France, Italy")) + .toDF("question", "context") + + "BertForMultipleChoiceTestSpec" should "answer a multiple choice question" taggedAs SlowTest in { + val resultDf = pipelineModel.transform(testDataframe) + resultDf.show(truncate=false) + + val result = AssertAnnotations.getActualResult(resultDf, "answer") + result.foreach { annotation => + annotation.foreach(a => assert(a.result.nonEmpty)) + } + } + + it should "work with light pipeline fullAnnotate" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(pipelineModel) + val resultFullAnnotate = lightPipeline.fullAnnotate( + "The Eiffel Tower is located in which country?", + "Germany, France, Italy") + println(s"resultAnnotate: $resultFullAnnotate") + + val answerAnnotation = resultFullAnnotate("answer").head.asInstanceOf[Annotation] + + assert(answerAnnotation.result.nonEmpty) + } + + it should "work with light pipeline annotate" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(pipelineModel) + val resultAnnotate = lightPipeline.annotate( + "The Eiffel Tower is located in which country?", + "Germany, France, Italy") + println(s"resultAnnotate: $resultAnnotate") + + assert(resultAnnotate("answer").head.nonEmpty) + } + + private def getBertForMultipleChoicePipelineModel = { + val documentAssembler = new MultiDocumentAssembler() + .setInputCols("question", "context") + .setOutputCols("document_question", "document_context") + + val bertForMultipleChoice = BertForMultipleChoice.pretrained() + .setInputCols("document_question", "document_context") + .setOutputCol("answer") + + val pipeline = new Pipeline().setStages(Array(documentAssembler, bertForMultipleChoice)) + + pipeline.fit(emptyDataSet) + } + +} From fc23d43834a364309033e4ffd232fba19a5ba85c Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Fri, 18 Oct 2024 20:27:17 +0200 Subject: [PATCH 10/24] fixing the missing import --- .../com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index 8049ba6b642473..20c9053b361d7b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -18,6 +18,7 @@ package com.johnsnowlabs.nlp.annotators.seq2seq import com.johnsnowlabs.ml.gguf.GGUFWrapper import com.johnsnowlabs.ml.util.LlamaCPP import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.llama.LlamaModel import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable From f792dea295cb2726d4c345865543717333784988 Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:03:43 +0200 Subject: [PATCH 
11/24] Fix pretrained models not being found on dbfs systems (#14438) --- .../johnsnowlabs/ml/gguf/GGUFWrapper.scala | 31 ++++++++++++++++++- .../annotators/seq2seq/AutoGGUFModel.scala | 21 ++----------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala index 495e8cb2a6b0f9..ef7091c3b5cd12 100644 --- a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala @@ -16,6 +16,8 @@ package com.johnsnowlabs.ml.gguf import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkFiles import org.apache.spark.sql.SparkSession import org.slf4j.{Logger, LoggerFactory} @@ -72,7 +74,7 @@ object GGUFWrapper { // TODO: make sure this.synchronized is needed or it's not a bottleneck private def withSafeGGUFModelLoader(modelParameters: ModelParameters): LlamaModel = this.synchronized { - new LlamaModel(modelParameters) // TODO: Model parameters + new LlamaModel(modelParameters) } def read(sparkSession: SparkSession, modelPath: String): GGUFWrapper = { @@ -89,4 +91,31 @@ object GGUFWrapper { new GGUFWrapper(modelFile.getName, modelFile.getParent) } + + def readModel(modelFolderPath: String, spark: SparkSession): GGUFWrapper = { + def findGGUFModelInFolder(folderPath: String): String = { + val folder = new File(folderPath) + if (folder.exists && folder.isDirectory) { + val ggufFile: String = folder.listFiles + .filter(_.isFile) + .filter(_.getName.endsWith(".gguf")) + .map(_.getAbsolutePath) + .headOption // Should only be one file + .getOrElse( + throw new IllegalArgumentException(s"Could not find GGUF model in $folderPath")) + + new File(ggufFile).getAbsolutePath + } else { + throw new IllegalArgumentException(s"Path $folderPath is not a directory") + } + } + + val uri = new java.net.URI(modelFolderPath.replaceAllLiterally("\\", "/")) + // In case the path belongs to a different file system but doesn't have the scheme prepended (e.g. dbfs) + val fileSystem: FileSystem = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration) + val actualFolderPath = fileSystem.resolvePath(new Path(modelFolderPath)).toString + val localFolder = ResourceHelper.copyToLocal(actualFolderPath) + val modelFile = findGGUFModelInFolder(localFolder) + read(spark, modelFile) + } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index 20c9053b361d7b..385b9ddc0e983d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -235,25 +235,8 @@ trait ReadAutoGGUFModel { this: ParamsAndFeaturesReadable[AutoGGUFModel] => def readModel(instance: AutoGGUFModel, path: String, spark: SparkSession): Unit = { - def findGGUFModelInFolder(): String = { - val folder = - new java.io.File( - path.replace("file:", "") - ) // File should be local at this point. TODO: Except if its HDFS? 
- if (folder.exists && folder.isDirectory) { - folder.listFiles - .filter(_.isFile) - .filter(_.getName.endsWith(".gguf")) - .map(_.getAbsolutePath) - .headOption // Should only be one file - .getOrElse(throw new IllegalArgumentException(s"Could not find GGUF model in $path")) - } else { - throw new IllegalArgumentException(s"Path $path is not a directory") - } - } - - val model = AutoGGUFModel.loadSavedModel(findGGUFModelInFolder(), spark) - instance.setModelIfNotSet(spark, model.getModelIfNotSet) + val model: GGUFWrapper = GGUFWrapper.readModel(path, spark) + instance.setModelIfNotSet(spark, model) } addReader(readModel) From ea28e978fc7a773b28153a141ce9b5363c9b2e53 Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:05:26 +0200 Subject: [PATCH 12/24] [SPARKNLP-1067] PromptAssembler (#14439) * [SPARKNLP-1067] PromptAssembler Prototype * [SPARKNLP-1067] PromptAssembler Scala Side * [SPARKNLP-1067] PromptAssembler Python Side * [SPARKNLP-1067] Add example for PromptAssembler * [SPARKNLP-1067] Add documentation for PromptAssembler * [SPARKNLP-1067] Update llamacpp dependency * Switch to slowtest for promptasssembler --- build.sbt | 4 +- docs/en/annotator_entries/PromptAssembler.md | 203 +++++++++++++ docs/en/annotators.md | 1 + .../PromptAssember_with_AutoGGUFModel.ipynb | 272 ++++++++++++++++++ project/Dependencies.scala | 3 +- python/sparknlp/base/__init__.py | 1 + python/sparknlp/base/prompt_assembler.py | 207 +++++++++++++ python/test/base/__init__.py | 0 python/test/base/prompt_assembler_test.py | 113 ++++++++ .../johnsnowlabs/nlp/PromptAssembler.scala | 250 ++++++++++++++++ .../nlp/pretrained/ResourceDownloader.scala | 6 +- .../nlp/PromptAssemblerTestSpec.scala | 92 ++++++ 12 files changed, 1146 insertions(+), 6 deletions(-) create mode 100644 docs/en/annotator_entries/PromptAssembler.md create mode 100644 examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb create mode 100644 python/sparknlp/base/prompt_assembler.py create mode 100644 python/test/base/__init__.py create mode 100644 python/test/base/prompt_assembler_test.py create mode 100644 src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/PromptAssemblerTestSpec.scala diff --git a/build.sbt b/build.sbt index 4f35f22f8ae570..153af1f7a25e24 100644 --- a/build.sbt +++ b/build.sbt @@ -185,8 +185,8 @@ val llamaCppDependencies = Seq(llamaCppGPU) else if (is_silicon.equals("true")) Seq(llamaCppSilicon) -// else if (is_aarch64.equals("true")) -// Seq(openVinoCPU) + else if (is_aarch64.equals("true")) + Seq(llamaCppAarch64) else Seq(llamaCppCPU) diff --git a/docs/en/annotator_entries/PromptAssembler.md b/docs/en/annotator_entries/PromptAssembler.md new file mode 100644 index 00000000000000..9e4ee498900f1b --- /dev/null +++ b/docs/en/annotator_entries/PromptAssembler.md @@ -0,0 +1,203 @@ +{%- capture title -%} +PromptAssembler +{%- endcapture -%} + +{%- capture description -%} +Assembles a sequence of messages into a single string using a template. These strings can then +be used as prompts for large language models. + +This annotator expects an array of two-tuples as the type of the input column (one array of +tuples per row). The first element of the tuples should be the role and the second element is +the text of the message. Possible roles are "system", "user" and "assistant". + +An assistant header can be added to the end of the generated string by using +`setAddAssistant(true)`. 
+ +At the moment, this annotator uses llama.cpp as a backend to parse and apply the templates. +llama.cpp uses basic pattern matching to determine the type of the template, then applies a +basic version of the template to the messages. This means that more advanced templates are not +supported. + +For an extended example see the +[example notebook](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb). +{%- endcapture -%} + +{%- capture input_anno -%} +NONE +{%- endcapture -%} + +{%- capture output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture python_example -%} +from sparknlp.base import * + +messages = [ + [ + ("system", "You are a helpful assistant."), + ("assistant", "Hello there, how can I help you?"), + ("user", "I need help with organizing my room."), + ] +] +df = spark.createDataFrame([messages]).toDF("messages") + +{% raw %} +# llama3.1 +template = ( + "{{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- " + "endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- " + 'endif %} {%- if not date_string is defined %} {%- set date_string = "26 Jul 2024" %} {%- endif %} ' + "{%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the " + "system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %}" + " {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else" + ' %} {%- set system_message = "" %} {%- endif %} {#- System message + builtin tools #} {{- ' + '"<|start_header_id|>system<|end_header_id|>\\n\n" }} {%- if builtin_tools is defined or tools is ' + 'not none %} {{- "Environment: ipython\\n" }} {%- endif %} {%- if builtin_tools is defined %} {{- ' + '"Tools: " + builtin_tools | reject(\'equalto\', \'code_interpreter\') | join(", ") + "\\n\n"}} ' + '{%- endif %} {{- "Cutting Knowledge Date: December 2023\\n" }} {{- "Today Date: " + date_string ' + '+ "\\n\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to ' + 'the following functions. To call a function, please respond with JSON for a function call." 
}} {{- ' + '\'Respond in the format {"name": function name, "parameters": dictionary of argument name and its' + ' value}.\' }} {{- "Do not use variables.\\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) ' + '}} {{- "\\n\n" }} {%- endfor %} {%- endif %} {{- system_message }} {{- "<|eot_id|>" }} {#- ' + "Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message " + "and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if " + "messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set " + 'messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user ' + "message when there's no first user message!\") }} {%- endif %} {{- " + "'<|start_header_id|>user<|end_header_id|>\\n\n' -}} {{- \"Given the following functions, please " + 'respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the ' + 'given prompt.\\n\n" }} {{- \'Respond in the format {"name": function name, "parameters": ' + 'dictionary of argument name and its value}.\' }} {{- "Do not use variables.\\n\n" }} {%- for t in ' + 'tools %} {{- t | tojson(indent=4) }} {{- "\\n\n" }} {%- endfor %} {{- first_user_message + ' + "\"<|eot_id|>\"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' " + "or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role']" + " + '<|end_header_id|>\\n\n'+ message['content'] | trim + '<|eot_id|>' }} {%- elif 'tool_calls' in " + 'message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only ' + 'supports single tool-calls at once!") }} {%- endif %} {%- set tool_call = message.tool_calls[0]' + ".function %} {%- if builtin_tools is defined and tool_call.name in builtin_tools %} {{- " + "'<|start_header_id|>assistant<|end_header_id|>\\n\n' -}} {{- \"<|python_tag|>\" + tool_call.name + " + '".call(" }} {%- for arg_name, arg_val in tool_call.arguments | items %} {{- arg_name + \'="\' + ' + 'arg_val + \'"\' }} {%- if not loop.last %} {{- ", " }} {%- endif %} {%- endfor %} {{- ")" }} {%- ' + "else %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\n' -}} {{- '{\"name\": \"' + " + 'tool_call.name + \'", \' }} {{- \'"parameters": \' }} {{- tool_call.arguments | tojson }} {{- "}" ' + "}} {%- endif %} {%- if builtin_tools is defined %} {#- This means we're in ipython mode #} {{- " + '"<|eom_id|>" }} {%- else %} {{- "<|eot_id|>" }} {%- endif %} {%- elif message.role == "tool" ' + 'or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\\n\n" }} {%- ' + "if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- " + 'else %} {{- message.content }} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} {%- endfor %} {%- if ' + "add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\n' }} {%- endif %} " +) +{% endraw %} + +prompt_assembler = ( + PromptAssembler() + .setInputCol("messages") + .setOutputCol("prompt") + .setChatTemplate(template) +) + +prompt_assembler.transform(df).select("prompt.result").show(truncate=False) ++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|result | 
++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|[<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello there, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI need help with organizing my room.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n]| ++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +// Batches (whole conversations) of arrays of messages +val data: Seq[Seq[(String, String)]] = Seq( + Seq( + ("system", "You are a helpful assistant."), + ("assistant", "Hello there, how can I help you?"), + ("user", "I need help with organizing my room."))) + +val dataDF = data.toDF("messages") + +{% raw %} +// llama3.1 +val template = + "{{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- " + + "endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- " + + "endif %} {%- if not date_string is defined %} {%- set date_string = \"26 Jul 2024\" %} {%- endif %} " + + "{%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the " + + "system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %}" + + " {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else" + + " %} {%- set system_message = \"\" %} {%- endif %} {#- System message + builtin tools #} {{- " + + "\"<|start_header_id|>system<|end_header_id|>\\n\\n\" }} {%- if builtin_tools is defined or tools is " + + "not none %} {{- \"Environment: ipython\\n\" }} {%- endif %} {%- if builtin_tools is defined %} {{- " + + "\"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}} " + + "{%- endif %} {{- \"Cutting Knowledge Date: December 2023\\n\" }} {{- \"Today Date: \" + date_string " + + "+ \"\\n\\n\" }} {%- if tools is not none and not tools_in_user_message %} {{- \"You have access to " + + "the following functions. To call a function, please respond with JSON for a function call.\" }} {{- " + + "'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its" + + " value}.' 
}} {{- \"Do not use variables.\\n\\n\" }} {%- for t in tools %} {{- t | tojson(indent=4) " + + "}} {{- \"\\n\\n\" }} {%- endfor %} {%- endif %} {{- system_message }} {{- \"<|eot_id|>\" }} {#- " + + "Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message " + + "and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if " + + "messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set " + + "messages = messages[1:] %} {%- else %} {{- raise_exception(\"Cannot put tools in the first user " + + "message when there's no first user message!\") }} {%- endif %} {{- " + + "'<|start_header_id|>user<|end_header_id|>\\n\\n' -}} {{- \"Given the following functions, please " + + "respond with a JSON for a function call \" }} {{- \"with its proper arguments that best answers the " + + "given prompt.\\n\\n\" }} {{- 'Respond in the format {\"name\": function name, \"parameters\": " + + "dictionary of argument name and its value}.' }} {{- \"Do not use variables.\\n\\n\" }} {%- for t in " + + "tools %} {{- t | tojson(indent=4) }} {{- \"\\n\\n\" }} {%- endfor %} {{- first_user_message + " + + "\"<|eot_id|>\"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' " + + "or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role']" + + " + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }} {%- elif 'tool_calls' in " + + "message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception(\"This model only " + + "supports single tool-calls at once!\") }} {%- endif %} {%- set tool_call = message.tool_calls[0]" + + ".function %} {%- if builtin_tools is defined and tool_call.name in builtin_tools %} {{- " + + "'<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- \"<|python_tag|>\" + tool_call.name + " + + "\".call(\" }} {%- for arg_name, arg_val in tool_call.arguments | items %} {{- arg_name + '=\"' + " + + "arg_val + '\"' }} {%- if not loop.last %} {{- \", \" }} {%- endif %} {%- endfor %} {{- \")\" }} {%- " + + "else %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- '{\"name\": \"' + " + + "tool_call.name + '\", ' }} {{- '\"parameters\": ' }} {{- tool_call.arguments | tojson }} {{- \"}\" " + + "}} {%- endif %} {%- if builtin_tools is defined %} {#- This means we're in ipython mode #} {{- " + + "\"<|eom_id|>\" }} {%- else %} {{- \"<|eot_id|>\" }} {%- endif %} {%- elif message.role == \"tool\" " + + "or message.role == \"ipython\" %} {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }} {%- " + + "if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- " + + "else %} {{- message.content }} {%- endif %} {{- \"<|eot_id|>\" }} {%- endif %} {%- endfor %} {%- if " + + "add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }} {%- endif %} " +{% endraw %} + +val promptAssembler = new PromptAssembler() + .setInputCol("messages") + .setOutputCol("prompt") + .setChatTemplate(template) + +promptAssembler.transform(dataDF).select("prompt.result").show(truncate = false) ++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|result | 
++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +|[<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello there, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI need help with organizing my room.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n]| ++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + +{%- capture api_link -%} +[PromptAssembler](/api/com/johnsnowlabs/nlp/PromptAssembler) +{%- endcapture -%} + +{%- capture python_api_link -%} +[PromptAssembler](/api/python/reference/autosummary/sparknlp/base/prompt_assembler/index.html#sparknlp.base.prompt_assembler.PromptAssembler) +{%- endcapture -%} + +{%- capture source_link -%} +[PromptAssembler](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/annotators.md b/docs/en/annotators.md index 161bd8f8e3f496..4526453a7ebc94 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -83,6 +83,7 @@ There are two types of Annotators: {% include templates/anno_table_entry.md path="" name="Normalizer" summary="Removes all dirty characters from text following a regex pattern and transforms words based on a provided dictionary."%} {% include templates/anno_table_entry.md path="" name="NorvigSweeting Spellchecker" summary="Retrieves tokens and makes corrections automatically if not found in an English dictionary."%} {% include templates/anno_table_entry.md path="" name="POSTagger (Part of speech tagger)" summary="Averaged Perceptron model to tag words part-of-speech."%} +{% include templates/anno_table_entry.md path="" name="PromptAssembler" summary="Assembles a sequence of messages into a single string using a template."%} {% include templates/anno_table_entry.md path="" name="RecursiveTokenizer" summary="Tokenizes raw text recursively based on a handful of definable rules."%} {% include templates/anno_table_entry.md path="" name="RegexMatcher" summary="Uses rules to match a set of regular expressions and associate them with a provided identifier."%} {% include templates/anno_table_entry.md path="" name="RegexTokenizer" summary="A tokenizer that splits text by a regex pattern."%} diff --git a/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb new file mode 100644 index 00000000000000..9eb0f1884e8bb7 --- /dev/null +++ b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb @@ -0,0 +1,272 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb)\n", + "\n", + "# PromptAssembler with AutoGGUFModel\n", + "\n", + "Let's keep in mind a few things before we start ๐Ÿ˜Š\n", + "\n", + "- llama.cpp support in the form of the `AutoGGUFModel` was introduced in `Spark NLP 5.5.0`, enabling quantized LLM inference on a wide range of devices. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- The `PromptAssembler` was introduced in `Spark NLP 5.5.1` to enable the construction of message prompts.\n", + "\n", + "This notebook will show you how you can construct your own message prompts for the AutoGGUFModel." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install and Start Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP (if running it Google Colab)\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only execute this if you are on Google Colab\n", + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP with GPU enabled. If you don't have GPUs available remove this parameter.\n", + "spark = sparknlp.start(gpu=True)\n", + "print(sparknlp.version())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a `PromptAssembler` and use it to recreate the following conversation between a chatbot and a user:\n", + "\n", + "```\n", + "SYSTEM: You are a helpful assistant.\n", + "ASSISTANT: Hello there! How can I help you today?\n", + "USER: I need help with organizing my room, give me some advice.\n", + "```\n", + "\n", + "First we need to structure our messages in our Spark DataFrame correctly. For each row, the PromptAssembler expects an array of two-tuples. The first field should be the role and the second field the message. We will call this column `message`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|messages |\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{system, You are a helpful assistant.}, {assistant, Hello there! How can I help you today?}, {user, I need help with organizing my room, give me some advice.}]|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "messages = [\n", + " (\"system\", \"You are a helpful assistant.\"),\n", + " (\"assistant\", \"Hello there! 
How can I help you today?\"),\n", + " (\"user\", \"I need help with organizing my room, give me some advice.\"),\n", + "]\n", + "df = spark.createDataFrame([[messages]]).toDF(\"messages\")\n", + "df.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create the PromptAssembler to generate the prompts. We will use the template from [llama3.1 (extracted from the gguf file)](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF?show_file_info=Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf).\n", + "\n", + "By default, the `addAssistant` parameter is set to `True`, so an assistant header will be appended to the end." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.base import *\n", + "\n", + "\n", + "template = (\n", + " \"{{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- \"\n", + " \"endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- \"\n", + " 'endif %} {%- if not date_string is defined %} {%- set date_string = \"26 Jul 2024\" %} {%- endif %} '\n", + " \"{%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the \"\n", + " \"system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %}\"\n", + " \" {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else\"\n", + " ' %} {%- set system_message = \"\" %} {%- endif %} {#- System message + builtin tools #} {{- '\n", + " '\"<|start_header_id|>system<|end_header_id|>\\\\n\\\\n\" }} {%- if builtin_tools is defined or tools is '\n", + " 'not none %} {{- \"Environment: ipython\\\\n\" }} {%- endif %} {%- if builtin_tools is defined %} {{- '\n", + " '\"Tools: \" + builtin_tools | reject(\\'equalto\\', \\'code_interpreter\\') | join(\", \") + \"\\\\n\\\\n\"}} '\n", + " '{%- endif %} {{- \"Cutting Knowledge Date: December 2023\\\\n\" }} {{- \"Today Date: \" + date_string '\n", + " '+ \"\\\\n\\\\n\" }} {%- if tools is not none and not tools_in_user_message %} {{- \"You have access to '\n", + " 'the following functions. 
To call a function, please respond with JSON for a function call.\" }} {{- '\n", + " '\\'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its'\n", + " ' value}.\\' }} {{- \"Do not use variables.\\\\n\\\\n\" }} {%- for t in tools %} {{- t | tojson(indent=4) '\n", + " '}} {{- \"\\\\n\\\\n\" }} {%- endfor %} {%- endif %} {{- system_message }} {{- \"<|eot_id|>\" }} {#- '\n", + " \"Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message \"\n", + " \"and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if \"\n", + " \"messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set \"\n", + " 'messages = messages[1:] %} {%- else %} {{- raise_exception(\"Cannot put tools in the first user '\n", + " \"message when there's no first user message!\\\") }} {%- endif %} {{- \"\n", + " \"'<|start_header_id|>user<|end_header_id|>\\\\n\\\\n' -}} {{- \\\"Given the following functions, please \"\n", + " 'respond with a JSON for a function call \" }} {{- \"with its proper arguments that best answers the '\n", + " 'given prompt.\\\\n\\\\n\" }} {{- \\'Respond in the format {\"name\": function name, \"parameters\": '\n", + " 'dictionary of argument name and its value}.\\' }} {{- \"Do not use variables.\\\\n\\\\n\" }} {%- for t in '\n", + " 'tools %} {{- t | tojson(indent=4) }} {{- \"\\\\n\\\\n\" }} {%- endfor %} {{- first_user_message + '\n", + " \"\\\"<|eot_id|>\\\"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' \"\n", + " \"or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role']\"\n", + " \" + '<|end_header_id|>\\\\n\\\\n'+ message['content'] | trim + '<|eot_id|>' }} {%- elif 'tool_calls' in \"\n", + " 'message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception(\"This model only '\n", + " 'supports single tool-calls at once!\") }} {%- endif %} {%- set tool_call = message.tool_calls[0]'\n", + " \".function %} {%- if builtin_tools is defined and tool_call.name in builtin_tools %} {{- \"\n", + " \"'<|start_header_id|>assistant<|end_header_id|>\\\\n\\\\n' -}} {{- \\\"<|python_tag|>\\\" + tool_call.name + \"\n", + " '\".call(\" }} {%- for arg_name, arg_val in tool_call.arguments | items %} {{- arg_name + \\'=\"\\' + '\n", + " 'arg_val + \\'\"\\' }} {%- if not loop.last %} {{- \", \" }} {%- endif %} {%- endfor %} {{- \")\" }} {%- '\n", + " \"else %} {{- '<|start_header_id|>assistant<|end_header_id|>\\\\n\\\\n' -}} {{- '{\\\"name\\\": \\\"' + \"\n", + " 'tool_call.name + \\'\", \\' }} {{- \\'\"parameters\": \\' }} {{- tool_call.arguments | tojson }} {{- \"}\" '\n", + " \"}} {%- endif %} {%- if builtin_tools is defined %} {#- This means we're in ipython mode #} {{- \"\n", + " '\"<|eom_id|>\" }} {%- else %} {{- \"<|eot_id|>\" }} {%- endif %} {%- elif message.role == \"tool\" '\n", + " 'or message.role == \"ipython\" %} {{- \"<|start_header_id|>ipython<|end_header_id|>\\\\n\\\\n\" }} {%- '\n", + " \"if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- \"\n", + " 'else %} {{- message.content }} {%- endif %} {{- \"<|eot_id|>\" }} {%- endif %} {%- endfor %} {%- if '\n", + " \"add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\\\\n\\\\n' }} {%- endif %} \"\n", + ")\n", + "\n", + "promptAssembler = (\n", + " PromptAssembler()\n", + " .setInputCol(\"messages\")\n", + " .setOutputCol(\"prompt\")\n", + 
" .setChatTemplate(template)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how the final prompt looks like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 8:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "jsl-llama: Extracted 'libjllama.so' to '/tmp/libjllama.so'\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|result |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[<|start_header_id|>system<|end_header_id|>\\n\\nYou are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\nHello there! How can I help you today?<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\nI need help with organizing my room, give me some advice.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n]|\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "promptAssembler.transform(df).select(\"prompt.result\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now you can feed the prompt to a llama3.1 model loaded with AutoGGUFModel. Depending on your messages, you might need to the chat template or system prompt in the AutoGGUFModel. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import AutoGGUFModel\n", + "\n", + "autoGGUFModel = (\n", + " AutoGGUFModel.loadSavedModel(\"path/to/llama3.1\", spark)\n", + " .setInputCols(\"prompt\")\n", + " .setOutputCol(\"completions\")\n", + " .setBatchSize(4)\n", + " .setNGpuLayers(99)\n", + " .setUseChatTemplate(False) # Don't apply the chat template\n", + " .setSystemPrompt(\n", + " \"Your system prompt\"\n", + " ) # Set custom system prompt if not specified in the messages. 
Leave empty for default.\n", + ")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/project/Dependencies.scala b/project/Dependencies.scala index f4a0048b6841ae..1a8bb8c8a1ceaa 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -128,10 +128,11 @@ object Dependencies { val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided val azureStorage = "com.azure" % "azure-storage-blob" % azureStorageVersion % Provided - val llamaCppVersion = "0.1.1-rc2" + val llamaCppVersion = "0.1.4" val llamaCppCPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-cpu" % llamaCppVersion val llamaCppGPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-gpu" % llamaCppVersion val llamaCppSilicon = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-silicon" % llamaCppVersion + val llamaCppAarch64 = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-aarch64" % llamaCppVersion /** ------- Dependencies end ------- */ } diff --git a/python/sparknlp/base/__init__.py b/python/sparknlp/base/__init__.py index adc4acd52ed338..f4fbeadc55e91d 100644 --- a/python/sparknlp/base/__init__.py +++ b/python/sparknlp/base/__init__.py @@ -26,3 +26,4 @@ from sparknlp.base.image_assembler import * from sparknlp.base.audio_assembler import * from sparknlp.base.table_assembler import * +from sparknlp.base.prompt_assembler import * diff --git a/python/sparknlp/base/prompt_assembler.py b/python/sparknlp/base/prompt_assembler.py new file mode 100644 index 00000000000000..ae2a26a1f65003 --- /dev/null +++ b/python/sparknlp/base/prompt_assembler.py @@ -0,0 +1,207 @@ +# Copyright 2017-2024 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for the PromptAssembler.""" + +from pyspark import keyword_only +from pyspark.ml.param import TypeConverters, Params, Param + +from sparknlp.common import AnnotatorType +from sparknlp.internal import AnnotatorTransformer + + +class PromptAssembler(AnnotatorTransformer): + """Assembles a sequence of messages into a single string using a template. These strings can then + be used as prompts for large language models. + + This annotator expects an array of two-tuples as the type of the input column (one array of + tuples per row). The first element of the tuples should be the role and the second element is + the text of the message. Possible roles are "system", "user" and "assistant". + + An assistant header can be added to the end of the generated string by using + ``setAddAssistant(True)``. + + At the moment, this annotator uses llama.cpp as a backend to parse and apply the templates. 
+ llama.cpp uses basic pattern matching to determine the type of the template, then applies a + basic version of the template to the messages. This means that more advanced templates are not + supported. + + For an extended example see the + `example notebook `__. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``NONE`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + inputCol + Input column name + outputCol + Output column name + chatTemplate + Template used for the chat + addAssistant + Whether to add an assistant header to the end of the generated string + + Examples + -------- + >>> from sparknlp.base import * + >>> messages = [ + ... [ + ... ("system", "You are a helpful assistant."), + ... ("assistant", "Hello there, how can I help you?"), + ... ("user", "I need help with organizing my room."), + ... ] + ... ] + >>> df = spark.createDataFrame([messages]).toDF("messages") + >>> template = ( + ... "{{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- " + ... "endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- " + ... 'endif %} {%- if not date_string is defined %} {%- set date_string = "26 Jul 2024" %} {%- endif %} ' + ... "{%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the " + ... "system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %}" + ... " {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else" + ... ' %} {%- set system_message = "" %} {%- endif %} {#- System message + builtin tools #} {{- ' + ... '"<|start_header_id|>system<|end_header_id|>\\n\\n" }} {%- if builtin_tools is defined or tools is ' + ... 'not none %} {{- "Environment: ipython\\n" }} {%- endif %} {%- if builtin_tools is defined %} {{- ' + ... '"Tools: " + builtin_tools | reject(\\'equalto\', \\'code_interpreter\\') | join(", ") + "\\n\\n"}} ' + ... '{%- endif %} {{- "Cutting Knowledge Date: December 2023\\n" }} {{- "Today Date: " + date_string ' + ... '+ "\\n\\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to ' + ... 'the following functions. To call a function, please respond with JSON for a function call." }} {{- ' + ... '\\'Respond in the format {"name": function name, "parameters": dictionary of argument name and its' + ... ' value}.\\' }} {{- "Do not use variables.\\n\\n" }} {%- for t in tools %} {{- t | tojson(indent=4) ' + ... '}} {{- "\\n\\n" }} {%- endfor %} {%- endif %} {{- system_message }} {{- "<|eot_id|>" }} {#- ' + ... "Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message " + ... "and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if " + ... "messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set " + ... 'messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user ' + ... "message when there's no first user message!\\") }} {%- endif %} {{- " + ... "'<|start_header_id|>user<|end_header_id|>\\n\\n' -}} {{- \\"Given the following functions, please " + ... 'respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the ' + ... 
'given prompt.\\n\\n" }} {{- \\'Respond in the format {"name": function name, "parameters": ' + ... 'dictionary of argument name and its value}.\\' }} {{- "Do not use variables.\\n\\n" }} {%- for t in ' + ... 'tools %} {{- t | tojson(indent=4) }} {{- "\\n\\n" }} {%- endfor %} {{- first_user_message + ' + ... "\\"<|eot_id|>\\"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' " + ... "or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role']" + ... " + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }} {%- elif 'tool_calls' in " + ... 'message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only ' + ... 'supports single tool-calls at once!") }} {%- endif %} {%- set tool_call = message.tool_calls[0]' + ... ".function %} {%- if builtin_tools is defined and tool_call.name in builtin_tools %} {{- " + ... "'<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- \\"<|python_tag|>\\" + tool_call.name + " + ... '".call(" }} {%- for arg_name, arg_val in tool_call.arguments | items %} {{- arg_name + \\'="\\' + ' + ... 'arg_val + \\'"\\' }} {%- if not loop.last %} {{- ", " }} {%- endif %} {%- endfor %} {{- ")" }} {%- ' + ... "else %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- '{\\"name\": \\"' + " + ... 'tool_call.name + \\'", \\' }} {{- \\'"parameters": \\' }} {{- tool_call.arguments | tojson }} {{- "}" ' + ... "}} {%- endif %} {%- if builtin_tools is defined %} {#- This means we're in ipython mode #} {{- " + ... '"<|eom_id|>" }} {%- else %} {{- "<|eot_id|>" }} {%- endif %} {%- elif message.role == "tool" ' + ... 'or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }} {%- ' + ... "if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- " + ... 'else %} {{- message.content }} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} {%- endfor %} {%- if ' + ... "add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }} {%- endif %} " + ... ) + >>> prompt_assembler = ( + ... PromptAssembler() + ... .setInputCol("messages") + ... .setOutputCol("prompt") + ... .setChatTemplate(template) + ... 
) + >>> prompt_assembler.transform(df).select("prompt.result").show(truncate=False) + +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |result | + +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |[<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello there, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI need help with organizing my room.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n]| + +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + """ + + outputAnnotatorType = AnnotatorType.DOCUMENT + + inputCol = Param( + Params._dummy(), + "inputCol", + "input column name", + typeConverter=TypeConverters.toString, + ) + outputCol = Param( + Params._dummy(), + "outputCol", + "output column name", + typeConverter=TypeConverters.toString, + ) + chatTemplate = Param( + Params._dummy(), + "chatTemplate", + "Template used for the chat", + typeConverter=TypeConverters.toString, + ) + addAssistant = Param( + Params._dummy(), + "addAssistant", + "Whether to add an assistant header to the end of the generated string", + typeConverter=TypeConverters.toBoolean, + ) + name = "PromptAssembler" + + @keyword_only + def __init__(self): + super(PromptAssembler, self).__init__( + classname="com.johnsnowlabs.nlp.PromptAssembler" + ) + self._setDefault(outputCol="prompt", addAssistant=True) + + @keyword_only + def setParams(self): + kwargs = self._input_kwargs + return self._set(**kwargs) + + def setInputCol(self, value): + """Sets input column name. + + Parameters + ---------- + value : str + Name of the input column + """ + return self._set(inputCol=value) + + def setOutputCol(self, value): + """Sets output column name. + + Parameters + ---------- + value : str + Name of the Output Column + """ + return self._set(outputCol=value) + + def setChatTemplate(self, value): + """Sets the chat template. + + Parameters + ---------- + value : str + Template used for the chat + """ + return self._set(chatTemplate=value) + + def setAddAssistant(self, value): + """Sets whether to add an assistant header to the end of the generated string. 
+ + Parameters + ---------- + value : bool + Whether to add an assistant header to the end of the generated string + """ + return self._set(addAssistant=value) diff --git a/python/test/base/__init__.py b/python/test/base/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/test/base/prompt_assembler_test.py b/python/test/base/prompt_assembler_test.py new file mode 100644 index 00000000000000..c9bce510972b0b --- /dev/null +++ b/python/test/base/prompt_assembler_test.py @@ -0,0 +1,113 @@ +# Copyright 2017-2024 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import pytest + +from sparknlp.base import * +from test.util import SparkSessionForTest + + +@pytest.mark.slow +class PromptAssemblerTest(unittest.TestCase): + def setUp(self): + self.spark = SparkSessionForTest.spark + messages = [ + [ + ("system", "You are a helpful assistant."), + ("assistant", "Hello there, how can I help you?"), + ("user", "I need help with organizing my room."), + ] + ] + self.df = self.spark.createDataFrame([messages]).toDF("messages") + + # llama3.1 + self.template = ( + "{{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- " + "endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- " + 'endif %} {%- if not date_string is defined %} {%- set date_string = "26 Jul 2024" %} {%- endif %} ' + "{%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the " + "system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %}" + " {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else" + ' %} {%- set system_message = "" %} {%- endif %} {#- System message + builtin tools #} {{- ' + '"<|start_header_id|>system<|end_header_id|>\\n\\n" }} {%- if builtin_tools is defined or tools is ' + 'not none %} {{- "Environment: ipython\\n" }} {%- endif %} {%- if builtin_tools is defined %} {{- ' + '"Tools: " + builtin_tools | reject(\'equalto\', \'code_interpreter\') | join(", ") + "\\n\\n"}} ' + '{%- endif %} {{- "Cutting Knowledge Date: December 2023\\n" }} {{- "Today Date: " + date_string ' + '+ "\\n\\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to ' + 'the following functions. To call a function, please respond with JSON for a function call." 
}} {{- ' + '\'Respond in the format {"name": function name, "parameters": dictionary of argument name and its' + ' value}.\' }} {{- "Do not use variables.\\n\\n" }} {%- for t in tools %} {{- t | tojson(indent=4) ' + '}} {{- "\\n\\n" }} {%- endfor %} {%- endif %} {{- system_message }} {{- "<|eot_id|>" }} {#- ' + "Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message " + "and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if " + "messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set " + 'messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user ' + "message when there's no first user message!\") }} {%- endif %} {{- " + "'<|start_header_id|>user<|end_header_id|>\\n\\n' -}} {{- \"Given the following functions, please " + 'respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the ' + 'given prompt.\\n\\n" }} {{- \'Respond in the format {"name": function name, "parameters": ' + 'dictionary of argument name and its value}.\' }} {{- "Do not use variables.\\n\\n" }} {%- for t in ' + 'tools %} {{- t | tojson(indent=4) }} {{- "\\n\\n" }} {%- endfor %} {{- first_user_message + ' + "\"<|eot_id|>\"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' " + "or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role']" + " + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }} {%- elif 'tool_calls' in " + 'message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only ' + 'supports single tool-calls at once!") }} {%- endif %} {%- set tool_call = message.tool_calls[0]' + ".function %} {%- if builtin_tools is defined and tool_call.name in builtin_tools %} {{- " + "'<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- \"<|python_tag|>\" + tool_call.name + " + '".call(" }} {%- for arg_name, arg_val in tool_call.arguments | items %} {{- arg_name + \'="\' + ' + 'arg_val + \'"\' }} {%- if not loop.last %} {{- ", " }} {%- endif %} {%- endfor %} {{- ")" }} {%- ' + "else %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- '{\"name\": \"' + " + 'tool_call.name + \'", \' }} {{- \'"parameters": \' }} {{- tool_call.arguments | tojson }} {{- "}" ' + "}} {%- endif %} {%- if builtin_tools is defined %} {#- This means we're in ipython mode #} {{- " + '"<|eom_id|>" }} {%- else %} {{- "<|eot_id|>" }} {%- endif %} {%- elif message.role == "tool" ' + 'or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }} {%- ' + "if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- " + 'else %} {{- message.content }} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} {%- endfor %} {%- if ' + "add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }} {%- endif %} " + ) + + def runTest(self): + prompt_assembler = ( + PromptAssembler() + .setInputCol("messages") + .setOutputCol("prompt") + .setChatTemplate(self.template) + ) + + expectedNoAss = ( + "<|start_header_id|>system<|end_header_id|>\n" + + "\n" + + "You are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" + + "\n" + + "Hello there, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>\n" + + "\n" + + "I need help with organizing my room.<|eot_id|>" + ) + + assistantHeader = 
"<|start_header_id|>assistant<|end_header_id|>\n\n" + + results = prompt_assembler.transform(self.df).select("prompt.result").collect() + + assert results[0].result[0] == expectedNoAss + assistantHeader + + results_noass = ( + prompt_assembler.setAddAssistant(False) + .transform(self.df) + .select("prompt.result") + .collect() + ) + + assert results_noass[0].result[0] == expectedNoAss diff --git a/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala new file mode 100644 index 00000000000000..3a58132965071d --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/PromptAssembler.scala @@ -0,0 +1,250 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT +import com.johnsnowlabs.nlp.llama.LlamaModel +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, DataFrame, Dataset} +import org.apache.spark.sql.types.StructType + +/** Assembles a sequence of messages into a single string using a template. These strings can then + * be used as prompts for large language models. + * + * This annotator expects an array of two-tuples as the type of the input column (one array of + * tuples per row). The first element of the tuples should be the role and the second element is + * the text of the message. Possible roles are "system", "user" and "assistant". + * + * An assistant header can be added to the end of the generated string by using + * `setAddAssistant(true)`. + * + * At the moment, this annotator uses llama.cpp as a backend to parse and apply the templates. + * llama.cpp uses basic pattern matching to determine the type of the template, then applies a + * basic version of the template to the messages. This means that more advanced templates are not + * supported. + * + * For an extended example see the + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb example notebook]]. + * + * ==Example== + * {{{ + * // Batches (whole conversations) of arrays of messages + * val data: Seq[Seq[(String, String)]] = Seq( + * Seq( + * ("system", "You are a helpful assistant."), + * ("assistant", "Hello there, how can I help you?"), + * ("user", "I need help with organizing my room."))) + * + * val dataDF = data.toDF("messages") + * + * // llama3.1 + * val template = + * "{{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- " + + * "endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- " + + * "endif %} {%- if not date_string is defined %} {%- set date_string = \"26 Jul 2024\" %} {%- endif %} " + + * "{%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the " + + * "system message, so we can slot it into the right place. 
#} {%- if messages[0]['role'] == 'system' %}" + + * " {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else" + + * " %} {%- set system_message = \"\" %} {%- endif %} {#- System message + builtin tools #} {{- " + + * "\"<|start_header_id|>system<|end_header_id|>\\n\\n\" }} {%- if builtin_tools is defined or tools is " + + * "not none %} {{- \"Environment: ipython\\n\" }} {%- endif %} {%- if builtin_tools is defined %} {{- " + + * "\"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}} " + + * "{%- endif %} {{- \"Cutting Knowledge Date: December 2023\\n\" }} {{- \"Today Date: \" + date_string " + + * "+ \"\\n\\n\" }} {%- if tools is not none and not tools_in_user_message %} {{- \"You have access to " + + * "the following functions. To call a function, please respond with JSON for a function call.\" }} {{- " + + * "'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its" + + * " value}.' }} {{- \"Do not use variables.\\n\\n\" }} {%- for t in tools %} {{- t | tojson(indent=4) " + + * "}} {{- \"\\n\\n\" }} {%- endfor %} {%- endif %} {{- system_message }} {{- \"<|eot_id|>\" }} {#- " + + * "Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message " + + * "and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if " + + * "messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set " + + * "messages = messages[1:] %} {%- else %} {{- raise_exception(\"Cannot put tools in the first user " + + * "message when there's no first user message!\") }} {%- endif %} {{- " + + * "'<|start_header_id|>user<|end_header_id|>\\n\\n' -}} {{- \"Given the following functions, please " + + * "respond with a JSON for a function call \" }} {{- \"with its proper arguments that best answers the " + + * "given prompt.\\n\\n\" }} {{- 'Respond in the format {\"name\": function name, \"parameters\": " + + * "dictionary of argument name and its value}.' 
}} {{- \"Do not use variables.\\n\\n\" }} {%- for t in " + + * "tools %} {{- t | tojson(indent=4) }} {{- \"\\n\\n\" }} {%- endfor %} {{- first_user_message + " + + * "\"<|eot_id|>\"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' " + + * "or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role']" + + * " + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }} {%- elif 'tool_calls' in " + + * "message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception(\"This model only " + + * "supports single tool-calls at once!\") }} {%- endif %} {%- set tool_call = message.tool_calls[0]" + + * ".function %} {%- if builtin_tools is defined and tool_call.name in builtin_tools %} {{- " + + * "'<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- \"<|python_tag|>\" + tool_call.name + " + + * "\".call(\" }} {%- for arg_name, arg_val in tool_call.arguments | items %} {{- arg_name + '=\"' + " + + * "arg_val + '\"' }} {%- if not loop.last %} {{- \", \" }} {%- endif %} {%- endfor %} {{- \")\" }} {%- " + + * "else %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- '{\"name\": \"' + " + + * "tool_call.name + '\", ' }} {{- '\"parameters\": ' }} {{- tool_call.arguments | tojson }} {{- \"}\" " + + * "}} {%- endif %} {%- if builtin_tools is defined %} {#- This means we're in ipython mode #} {{- " + + * "\"<|eom_id|>\" }} {%- else %} {{- \"<|eot_id|>\" }} {%- endif %} {%- elif message.role == \"tool\" " + + * "or message.role == \"ipython\" %} {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }} {%- " + + * "if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- " + + * "else %} {{- message.content }} {%- endif %} {{- \"<|eot_id|>\" }} {%- endif %} {%- endfor %} {%- if " + + * "add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }} {%- endif %} " + * + * val promptAssembler = new PromptAssembler() + * .setInputCol("messages") + * .setOutputCol("prompt") + * .setChatTemplate(template) + * + * promptAssembler.transform(dataDF).select("prompt.result").show(truncate = false) + * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + * |result | + * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + * |[<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello there, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nI need help with organizing my room.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n]| + * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + * + * }}} + * + * @param uid + * required uid 
for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class PromptAssembler(override val uid: String) + extends Transformer + with DefaultParamsWritable + with HasOutputAnnotatorType + with HasOutputAnnotationCol { + override val outputAnnotatorType: AnnotatorType = DOCUMENT + + def this() = this(Identifiable.randomUID("PROMPT_ASSEMBLER")) + + val chatTemplate: Param[String] = + new Param[String](this, "chatTemplate", "Template used for the chat") + + val inputCol: Param[String] = + new Param[String](this, "inputCol", "Input column containing a sequence of messages") + + val addAssistant: BooleanParam = + new BooleanParam( + this, + "addAssistant", + "Whether to add an assistant header to the end of the generated string") + + setDefault(addAssistant -> true) + + /** Sets the input text column for processing + * + * @group setParam + */ + def setInputCol(value: String): this.type = set(inputCol, value) + def getInputCol: String = $(inputCol) + + /** Sets the chat template to be used for the chat. Should be something that llama.cpp can + * parse. + * + * @param value + * The template to use + */ + def setChatTemplate(value: String): this.type = set(chatTemplate, value) + + /** Gets the chat template to be used for the chat. + * + * @return + * The template to use + */ + def getChatTemplate: String = $(chatTemplate) + + /** Whether to add an assistant header to the end of the generated string. + * + * @param value + * Whether to add the assistant header + */ + def setAddAssistant(value: Boolean): this.type = set(addAssistant, value) + + /** Whether to add an assistant header to the end of the generated string. + * + * @return + * Whether to add the assistant header + */ + def getAddAssistant: Boolean = $(addAssistant) + + // Expected Input type of the input column + private val expectedInputType = ArrayType( + StructType( + Seq( + StructField("_1", StringType, nullable = true), + StructField("_2", StringType, nullable = true))), + containsNull = true) + + /** Adds the result Annotation type to the schema. + * + * Requirement for pipeline transformation validation. 
It is called on fit() + */ + override final def transformSchema(schema: StructType): StructType = { + val metadataBuilder: MetadataBuilder = new MetadataBuilder() + metadataBuilder.putString("annotatorType", outputAnnotatorType) + val outputFields = schema.fields :+ + StructField( + getOutputCol, + ArrayType(Annotation.dataType), + nullable = false, + metadataBuilder.build) + StructType(outputFields) + } + + override def transform(dataset: Dataset[_]): DataFrame = { + val metadataBuilder: MetadataBuilder = new MetadataBuilder() + metadataBuilder.putString("annotatorType", outputAnnotatorType) + val columnDataType = dataset.schema.fields + .find(_.name == getInputCol) + .getOrElse( + throw new IllegalArgumentException(s"Dataset does not have any '$getInputCol' column")) + .dataType + + val documentAnnotations: Column = + if (columnDataType == expectedInputType) applyTemplate(dataset.col(getInputCol)) + else + throw new IllegalArgumentException( + s"Column '$getInputCol' must be of type Array[(String, String)] " + + s"(exactly '$expectedInputType'), but got $columnDataType") + + dataset.withColumn(getOutputCol, documentAnnotations.as(getOutputCol, metadataBuilder.build)) + } + + private def applyTemplate: UserDefinedFunction = udf { chat: Seq[(String, String)] => + try { + val template = $(chatTemplate) + + val chatArray = chat.map { case (role, text) => + Array(role, text) + }.toArray + + val chatString = LlamaModel.applyChatTemplate(template, chatArray, $(addAssistant)) + Seq(Annotation(chatString)) + } catch { + case _: Exception => + /* + * when there is a null in the row + * it outputs an empty Annotation + * */ + Seq.empty + } + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) +} + +/** This is the companion object of [[PromptAssembler]]. Please refer to that class for the + * documentation. 
+ */ +object PromptAssembler extends DefaultParamsReadable[PromptAssembler] diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index f271566e04715a..145bcc67f26b35 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -40,7 +40,7 @@ import com.johnsnowlabs.nlp.annotators.ws.WordSegmenterModel import com.johnsnowlabs.nlp.embeddings._ import com.johnsnowlabs.nlp.pretrained.ResourceType.ResourceType import com.johnsnowlabs.nlp.util.io.{OutputHelper, ResourceHelper} -import com.johnsnowlabs.nlp.{DocumentAssembler, TableAssembler, pretrained} +import com.johnsnowlabs.nlp.{DocumentAssembler, PromptAssembler, TableAssembler, pretrained} import com.johnsnowlabs.util._ import org.apache.hadoop.fs.FileSystem import org.apache.spark.ml.util.DefaultParamsReadable @@ -689,8 +689,8 @@ object PythonResourceDownloader { "MxbaiEmbeddings" -> MxbaiEmbeddings, "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings, "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification, - "BertForMultipleChoice" -> BertForMultipleChoice - ) + "BertForMultipleChoice" -> BertForMultipleChoice, + "PromptAssembler" -> PromptAssembler) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/PromptAssemblerTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/PromptAssemblerTestSpec.scala new file mode 100644 index 00000000000000..c65040062def5a --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/PromptAssemblerTestSpec.scala @@ -0,0 +1,92 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.scalatest.flatspec.AnyFlatSpec + +class PromptAssemblerTestSpec extends AnyFlatSpec { + import ResourceHelper.spark.implicits._ + + behavior of "PromptAssembler" + + it should "create some prompts" taggedAs SlowTest in { + // Batches (whole conversations) of arrays of messages + val data: Seq[Seq[(String, String)]] = Seq( + Seq( + ("system", "You are a helpful assistant."), + ("assistant", "Hello there, how can I help you?"), + ("user", "I need help with organizing my room."))) + + val dataDF = data.toDF("messages") + + val template = + "{{- bos_token }} {%- if custom_tools is defined %} {%- set tools = custom_tools %} {%- " + + "endif %} {%- if not tools_in_user_message is defined %} {%- set tools_in_user_message = true %} {%- " + + "endif %} {%- if not date_string is defined %} {%- set date_string = \"26 Jul 2024\" %} {%- endif %} " + + "{%- if not tools is defined %} {%- set tools = none %} {%- endif %} {#- This block extracts the " + + "system message, so we can slot it into the right place. 
#} {%- if messages[0]['role'] == 'system' %}" + + " {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else" + + " %} {%- set system_message = \"\" %} {%- endif %} {#- System message + builtin tools #} {{- " + + "\"<|start_header_id|>system<|end_header_id|>\\n\\n\" }} {%- if builtin_tools is defined or tools is " + + "not none %} {{- \"Environment: ipython\\n\" }} {%- endif %} {%- if builtin_tools is defined %} {{- " + + "\"Tools: \" + builtin_tools | reject('equalto', 'code_interpreter') | join(\", \") + \"\\n\\n\"}} " + + "{%- endif %} {{- \"Cutting Knowledge Date: December 2023\\n\" }} {{- \"Today Date: \" + date_string " + + "+ \"\\n\\n\" }} {%- if tools is not none and not tools_in_user_message %} {{- \"You have access to " + + "the following functions. To call a function, please respond with JSON for a function call.\" }} {{- " + + "'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its" + + " value}.' }} {{- \"Do not use variables.\\n\\n\" }} {%- for t in tools %} {{- t | tojson(indent=4) " + + "}} {{- \"\\n\\n\" }} {%- endfor %} {%- endif %} {{- system_message }} {{- \"<|eot_id|>\" }} {#- " + + "Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message " + + "and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if " + + "messages | length != 0 %} {%- set first_user_message = messages[0]['content']|trim %} {%- set " + + "messages = messages[1:] %} {%- else %} {{- raise_exception(\"Cannot put tools in the first user " + + "message when there's no first user message!\") }} {%- endif %} {{- " + + "'<|start_header_id|>user<|end_header_id|>\\n\\n' -}} {{- \"Given the following functions, please " + + "respond with a JSON for a function call \" }} {{- \"with its proper arguments that best answers the " + + "given prompt.\\n\\n\" }} {{- 'Respond in the format {\"name\": function name, \"parameters\": " + + "dictionary of argument name and its value}.' 
}} {{- \"Do not use variables.\\n\\n\" }} {%- for t in " + + "tools %} {{- t | tojson(indent=4) }} {{- \"\\n\\n\" }} {%- endfor %} {{- first_user_message + " + + "\"<|eot_id|>\"}} {%- endif %} {%- for message in messages %} {%- if not (message.role == 'ipython' " + + "or message.role == 'tool' or 'tool_calls' in message) %} {{- '<|start_header_id|>' + message['role']" + + " + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }} {%- elif 'tool_calls' in " + + "message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception(\"This model only " + + "supports single tool-calls at once!\") }} {%- endif %} {%- set tool_call = message.tool_calls[0]" + + ".function %} {%- if builtin_tools is defined and tool_call.name in builtin_tools %} {{- " + + "'<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- \"<|python_tag|>\" + tool_call.name + " + + "\".call(\" }} {%- for arg_name, arg_val in tool_call.arguments | items %} {{- arg_name + '=\"' + " + + "arg_val + '\"' }} {%- if not loop.last %} {{- \", \" }} {%- endif %} {%- endfor %} {{- \")\" }} {%- " + + "else %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}} {{- '{\"name\": \"' + " + + "tool_call.name + '\", ' }} {{- '\"parameters\": ' }} {{- tool_call.arguments | tojson }} {{- \"}\" " + + "}} {%- endif %} {%- if builtin_tools is defined %} {#- This means we're in ipython mode #} {{- " + + "\"<|eom_id|>\" }} {%- else %} {{- \"<|eot_id|>\" }} {%- endif %} {%- elif message.role == \"tool\" " + + "or message.role == \"ipython\" %} {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }} {%- " + + "if message.content is mapping or message.content is iterable %} {{- message.content | tojson }} {%- " + + "else %} {{- message.content }} {%- endif %} {{- \"<|eot_id|>\" }} {%- endif %} {%- endfor %} {%- if " + + "add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }} {%- endif %} " + + val promptAssembler = new PromptAssembler() + .setInputCol("messages") + .setOutputCol("prompt") + .setChatTemplate(template) + + val results = promptAssembler.transform(dataDF) + + val expectedOutputNoAss = + "<|start_header_id|>system<|end_header_id|>\n" + "\n" + "You are a helpful assistant.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" + "\n" + "Hello there, how can I help you?<|eot_id|><|start_header_id|>user<|end_header_id|>\n" + "\n" + "I need help with organizing my room.<|eot_id|>" + + val assistantHeader = "<|start_header_id|>assistant<|end_header_id|>\n\n" + + Annotation.collect(results, "prompt").foreach { annotations => + val prompt = annotations.map(_.result).head + assert(prompt === expectedOutputNoAss + assistantHeader) + } + + val resultsNoAss = promptAssembler.setAddAssistant(false).transform(dataDF) + + Annotation.collect(resultsNoAss, "prompt").foreach { annotations => + val prompt = annotations.map(_.result).head + assert(prompt === expectedOutputNoAss) + } + } + +} From 126fb030f0e4b06a90c0d58fbfd36597531e8877 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Mon, 21 Oct 2024 16:24:40 -0500 Subject: [PATCH 13/24] [SPARKNLP-1084] Adding notebooks for BertForMultipleChoice --- ...X_in_Spark_NLP_BertForMultipleChoice.ipynb | 2843 ++++++++++++++++ ...O_in_Spark_NLP_BertForMultipleChoice.ipynb | 2897 +++++++++++++++++ 2 files changed, 5740 insertions(+) create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForMultipleChoice.ipynb create mode 100644 
examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_BertForMultipleChoice.ipynb diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForMultipleChoice.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForMultipleChoice.ipynb new file mode 100644 index 00000000000000..7503cfd9f8b000 --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForMultipleChoice.ipynb @@ -0,0 +1,2843 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "PAsu8UVGoLVf" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForMultipleChoice.ipynb)\n", + "\n", + "## Import ONNX BertForMultipleChoice models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high-performance inference for models.\n", + "- `BertForMultipleChoice` is only available in `Spark NLP 5.5.1` and later, so please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import BERT models trained/fine-tuned for the multiple-choice task via `BertForMultipleChoice` or `TFBertForMultipleChoice`. These models are usually listed under the `Multiple Choice` category and have `bert` in their labels.\n", + "- Reference: [BertForMultipleChoice](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForMultipleChoice)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OzijcdtQpOx9" + }, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MlgoClMXpSg4" + }, + "source": [ + "- Let's install the `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` installed for Spark NLP itself; however, we need it to load and save models from HuggingFace." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cJWbob-kHICU", + "outputId": "634cc746-4b76-4b99-b9ea-ff72f7e865b6" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m44.4/44.4 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m422.6/422.6 kB\u001b[0m \u001b[31m29.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m13.2/13.2 MB\u001b[0m \u001b[31m111.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m20.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m94.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m9.9/9.9 MB\u001b[0m \u001b[31m118.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m471.6/471.6 kB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m60.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m455.8/455.8 kB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m16.0/16.0 MB\u001b[0m \u001b[31m102.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m16.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.16.1 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx] optimum" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XtewR2xdOa5s" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use the treained model above as an example and load it as a `ORTModelForMultipleChoice`, representing an ONNX model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "avTe8Oe5N-vw", + "outputId": "270cf088-de9d-4dd2-d0cf-56daba62e141" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "87VKKCh1N-Ut" + }, + "outputs": [], + "source": [ + "!pip install -q --upgrade transformers[onnx] optimum" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 313, + "referenced_widgets": [ + "536b66d0cccf4cf786748c8a6cd9a2e6", + "1130eb5f3c28457e91a0682afaf67069", + "9d04b383ca164c47858a0714a3787b0f", + "6bd22c6de12b4a1e85ac1d685d4c4b9d", + "56c28029750d4ed5825bc11743666216", + "44a6ba0b62c74caf9ac697a8b2508778", + "e9c242f006004fe7b0451aad4585bece", + "e9838254fa49415f9294b45b5b2e43dc", + "0fa29cab0fd9434999d96f1ad0bcba5a", + "63fc2a981df44df8b8bf973b27d676a2", + "6e713ef761d94db09ac9e8475db8b797", + "0ddde9b887664c33973e83300ec058dc", + "709708a88cfe4e309562309f917b28eb", + "d092115515934ef7b245d601ec76f0ff", + "895aef09577c4c4cbf1fbb4678d62d79", + "05e63f4743aa46399bba1f84dd5e6bdf", + "d362bde25e934f3c988d73b60c0c43a5", + "094a292084ef432691e6c18ce866c941", + "3ecc095f553f4aa2bec17ab89b0f9ffb", + "a45d7ec88de14af2b5d957d2488f2adb", + "e6c4c89ff3f442d5aeb529965cd20569", + "f53cb41ab1b24f26bdb00f668a5dd151", + "014b50ed188142929ace9a631c84272b", + "1f5cf58f0d3a49849118cfa915ef48ce", + "1392c7587f7d4949bcf397aa4c58dbb3", + "1cdfb59aa26847b8968fcd473a2be080", + "c26a6811d56d480493bd07187d83bb4e", + "9d36210ea042439fa416e56958ac07f0", + "473d11d9b392465498543c1ea419921e", + "edc0f80fbb06409387c1f5fc274a6dac", + "09916f2b1fb74c80a75c4f80cea37466", + "d42aab02a56b4d3595b472ba70d60140", + "7bab26cb7e6d4ed7bf937bfffc5cea44", + "faace0b7c16a457b85536935607b22d4", + "1fc9ff5ed52d4d9392a76d7e0b56dad9", + "42b56ec25f194b409b98cd8b329e88dc", + "02baf3c61b46482c837a2bfabd6147a0", + "7373e785482f42449b037db4f496e5bb", + "8ca5004746cd4eda9f9fa030ce30c40e", + "ed565b9626b649af8693010e62409eda", + "95668f7bc7c24486a5099fb165889472", + "14a6d11bbfeb40bba886c8c32f1ba6f3", + "7b1cb99f724340a188c415ee1e13d2d3", + "397e2db043d6472d99c74a379eabaa53", + "1dc01965dcc34e9cb6f56c10b6353d3e", + "58193dd681c448fca582d8f22d48beea", + "c08473e4c64a4e56913f51d03b0ae559", + "a01c40ccaa834d9c9ea4eee832167b91", + "e2220acbd46f40e985c2fd933bfd2911", + "d0a0dc4b27924784b5469934ce54e0c3", + "8e7055a1c81f42118eb7eeda5fcfce69", + "c93a0ef319914cbd8b6ba23ba5c4e072", + "83f74c615b1246feb25c2795bd9f6169", + "e206b3ee98874a778fdaf98415895eb3", + "7cd10b83ce46411986dd4cd21dabe6f3", + "ef00de2de7f44fb18171ffd25b37804e", + "062e75f35a3b401db7abdd4aa07ef56c", + "00f2b09d746741f0b33593688d90e256", + "19a5f60e1320433898ddcc49659b1239", + "8dc2d8522aef45e7a45b35e096ede730", + "cff2da919d7547d4ae865c3fb1dbdf3d", + "5b0426cdc92b450cb72195931afba570", + "af42ed0b65644f36b75b2d6f82fd7ddd", + "cab9e63c1c074789b3a4899e07eab141", + "17f3c052251246f9b503864c01097cd4", + "6a175dc342d54d5dbeffa52356e08b61" + ] + }, + "id": "Id33annImYM8", + "outputId": "ebbd9e75-7da7-4e7f-d715-916caf762dbf" + }, + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "536b66d0cccf4cf786748c8a6cd9a2e6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/670 [00:00 0, chunk -> 0, score -> 0.50158674}, []}]|\n", + "|[{chunk, 0, 6, Germany, {sentence -> 0, chunk -> 0, score -> 0.34051788}, []}] |\n", + "|[{chunk, 0, 3, Lion, {sentence -> 0, chunk -> 0, score -> 0.2668043}, []}] |\n", + "|[{chunk, 0, 3, 90ยฐC, {sentence -> 0, chunk -> 0, score -> 0.3636225}, []}] |\n", + "|[{chunk, 0, 4, Mars, {sentence -> 0, chunk -> 0, score -> 0.37094638}, []}] |\n", + "|[{chunk, 0, 10, Portuguese, {sentence -> 0, chunk -> 0, score -> 0.38204}, []}] |\n", + "|[{chunk, 0, 11, The Mongols, {sentence -> 0, chunk -> 0, score -> 0.30194965}, []}] |\n", + "|[{chunk, 0, 6, Oxygenm, {sentence -> 0, chunk -> 0, score -> 0.41825965}, []}] |\n", + "|[{chunk, 0, 6, Africa, {sentence -> 0, chunk -> 0, score -> 0.36631182}, []}] |\n", + "|[{chunk, 0, 15, Vincent van Gogh, {sentence -> 0, chunk -> 0, score -> 0.3519279}, []}] |\n", + "+-----------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = MultiDocumentAssembler() \\\n", + " .setInputCols([\"question\", \"choices\"]) \\\n", + " .setOutputCols([\"document_question\", \"document_choices\"])\n", + "\n", + "bert_for_multiple_choice = BertForMultipleChoice() \\\n", + " .load(\"./{}_spark_nlp_onnx\".format(MODEL_NAME)) \\\n", + " .setInputCols([\"document_question\", \"document_choices\"])\\\n", + " .setOutputCol(\"answer\") \\\n", + " .setBatchSize(4)\n", + "\n", + "pipeline = Pipeline(stages=[document_assembler, bert_for_multiple_choice])\n", + "pipeline_model = pipeline.fit(testing_df)\n", + "\n", + "pipeline_df = pipeline_model.transform(testing_df)\n", + "\n", + "pipeline_df.select(\"answer\").show(truncate=False)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00f2b09d746741f0b33593688d90e256": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + 
"bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_af42ed0b65644f36b75b2d6f82fd7ddd", + "max": 125, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cab9e63c1c074789b3a4899e07eab141", + "value": 125 + } + }, + "014b50ed188142929ace9a631c84272b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1f5cf58f0d3a49849118cfa915ef48ce", + "IPY_MODEL_1392c7587f7d4949bcf397aa4c58dbb3", + "IPY_MODEL_1cdfb59aa26847b8968fcd473a2be080" + ], + "layout": "IPY_MODEL_c26a6811d56d480493bd07187d83bb4e" + } + }, + "02baf3c61b46482c837a2bfabd6147a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7b1cb99f724340a188c415ee1e13d2d3", + "placeholder": "โ€‹", + "style": "IPY_MODEL_397e2db043d6472d99c74a379eabaa53", + "value": "โ€‡232k/232kโ€‡[00:00<00:00,โ€‡11.8MB/s]" + } + }, + "05e63f4743aa46399bba1f84dd5e6bdf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "062e75f35a3b401db7abdd4aa07ef56c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cff2da919d7547d4ae865c3fb1dbdf3d", + "placeholder": "โ€‹", + "style": "IPY_MODEL_5b0426cdc92b450cb72195931afba570", + "value": "special_tokens_map.json:โ€‡100%" 
+ } + }, + "094a292084ef432691e6c18ce866c941": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "09916f2b1fb74c80a75c4f80cea37466": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0ddde9b887664c33973e83300ec058dc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_709708a88cfe4e309562309f917b28eb", + "IPY_MODEL_d092115515934ef7b245d601ec76f0ff", + "IPY_MODEL_895aef09577c4c4cbf1fbb4678d62d79" + ], + "layout": "IPY_MODEL_05e63f4743aa46399bba1f84dd5e6bdf" + } + }, + "0fa29cab0fd9434999d96f1ad0bcba5a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1130eb5f3c28457e91a0682afaf67069": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_44a6ba0b62c74caf9ac697a8b2508778", + "placeholder": "โ€‹", + "style": "IPY_MODEL_e9c242f006004fe7b0451aad4585bece", + "value": "config.json:โ€‡100%" + } + }, + "1392c7587f7d4949bcf397aa4c58dbb3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_edc0f80fbb06409387c1f5fc274a6dac", + "max": 314, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_09916f2b1fb74c80a75c4f80cea37466", + "value": 314 + } + }, + "14a6d11bbfeb40bba886c8c32f1ba6f3": 
{ + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "17f3c052251246f9b503864c01097cd4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "19a5f60e1320433898ddcc49659b1239": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_17f3c052251246f9b503864c01097cd4", + "placeholder": "โ€‹", + "style": "IPY_MODEL_6a175dc342d54d5dbeffa52356e08b61", + "value": "โ€‡125/125โ€‡[00:00<00:00,โ€‡10.4kB/s]" + } + }, + "1cdfb59aa26847b8968fcd473a2be080": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d42aab02a56b4d3595b472ba70d60140", + "placeholder": "โ€‹", + "style": "IPY_MODEL_7bab26cb7e6d4ed7bf937bfffc5cea44", + "value": "โ€‡314/314โ€‡[00:00<00:00,โ€‡26.8kB/s]" + } + }, + "1dc01965dcc34e9cb6f56c10b6353d3e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + 
"IPY_MODEL_58193dd681c448fca582d8f22d48beea", + "IPY_MODEL_c08473e4c64a4e56913f51d03b0ae559", + "IPY_MODEL_a01c40ccaa834d9c9ea4eee832167b91" + ], + "layout": "IPY_MODEL_e2220acbd46f40e985c2fd933bfd2911" + } + }, + "1f5cf58f0d3a49849118cfa915ef48ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9d36210ea042439fa416e56958ac07f0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_473d11d9b392465498543c1ea419921e", + "value": "tokenizer_config.json:โ€‡100%" + } + }, + "1fc9ff5ed52d4d9392a76d7e0b56dad9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8ca5004746cd4eda9f9fa030ce30c40e", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ed565b9626b649af8693010e62409eda", + "value": "vocab.txt:โ€‡100%" + } + }, + "397e2db043d6472d99c74a379eabaa53": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3ecc095f553f4aa2bec17ab89b0f9ffb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "42b56ec25f194b409b98cd8b329e88dc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_95668f7bc7c24486a5099fb165889472", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_14a6d11bbfeb40bba886c8c32f1ba6f3", + "value": 231508 + } + }, + "44a6ba0b62c74caf9ac697a8b2508778": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "473d11d9b392465498543c1ea419921e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "536b66d0cccf4cf786748c8a6cd9a2e6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1130eb5f3c28457e91a0682afaf67069", + "IPY_MODEL_9d04b383ca164c47858a0714a3787b0f", + "IPY_MODEL_6bd22c6de12b4a1e85ac1d685d4c4b9d" + ], + "layout": "IPY_MODEL_56c28029750d4ed5825bc11743666216" + } + }, + "56c28029750d4ed5825bc11743666216": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "58193dd681c448fca582d8f22d48beea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d0a0dc4b27924784b5469934ce54e0c3", + "placeholder": "โ€‹", + "style": "IPY_MODEL_8e7055a1c81f42118eb7eeda5fcfce69", + "value": "tokenizer.json:โ€‡100%" + } + }, + "5b0426cdc92b450cb72195931afba570": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "63fc2a981df44df8b8bf973b27d676a2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6a175dc342d54d5dbeffa52356e08b61": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6bd22c6de12b4a1e85ac1d685d4c4b9d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_63fc2a981df44df8b8bf973b27d676a2", + "placeholder": "โ€‹", + "style": "IPY_MODEL_6e713ef761d94db09ac9e8475db8b797", + "value": "โ€‡670/670โ€‡[00:00<00:00,โ€‡51.3kB/s]" + } + }, + "6e713ef761d94db09ac9e8475db8b797": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "709708a88cfe4e309562309f917b28eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d362bde25e934f3c988d73b60c0c43a5", + "placeholder": "โ€‹", + "style": "IPY_MODEL_094a292084ef432691e6c18ce866c941", + "value": "pytorch_model.bin:โ€‡100%" + } + }, + "7373e785482f42449b037db4f496e5bb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7b1cb99f724340a188c415ee1e13d2d3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7bab26cb7e6d4ed7bf937bfffc5cea44": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7cd10b83ce46411986dd4cd21dabe6f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "83f74c615b1246feb25c2795bd9f6169": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "895aef09577c4c4cbf1fbb4678d62d79": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e6c4c89ff3f442d5aeb529965cd20569", + "placeholder": "โ€‹", + "style": "IPY_MODEL_f53cb41ab1b24f26bdb00f668a5dd151", + "value": "โ€‡438M/438Mโ€‡[00:17<00:00,โ€‡23.5MB/s]" + } + }, + "8ca5004746cd4eda9f9fa030ce30c40e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8dc2d8522aef45e7a45b35e096ede730": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8e7055a1c81f42118eb7eeda5fcfce69": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "95668f7bc7c24486a5099fb165889472": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d04b383ca164c47858a0714a3787b0f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e9838254fa49415f9294b45b5b2e43dc", + "max": 670, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0fa29cab0fd9434999d96f1ad0bcba5a", + "value": 670 + } + }, + "9d36210ea042439fa416e56958ac07f0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a01c40ccaa834d9c9ea4eee832167b91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e206b3ee98874a778fdaf98415895eb3", + "placeholder": "โ€‹", + "style": "IPY_MODEL_7cd10b83ce46411986dd4cd21dabe6f3", + "value": "โ€‡711k/711kโ€‡[00:00<00:00,โ€‡1.63MB/s]" + } + }, + "a45d7ec88de14af2b5d957d2488f2adb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "af42ed0b65644f36b75b2d6f82fd7ddd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + 
"grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c08473e4c64a4e56913f51d03b0ae559": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c93a0ef319914cbd8b6ba23ba5c4e072", + "max": 711494, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_83f74c615b1246feb25c2795bd9f6169", + "value": 711494 + } + }, + "c26a6811d56d480493bd07187d83bb4e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c93a0ef319914cbd8b6ba23ba5c4e072": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + 
"margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cab9e63c1c074789b3a4899e07eab141": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cff2da919d7547d4ae865c3fb1dbdf3d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d092115515934ef7b245d601ec76f0ff": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3ecc095f553f4aa2bec17ab89b0f9ffb", + "max": 438000433, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a45d7ec88de14af2b5d957d2488f2adb", + "value": 438000433 + } + }, + "d0a0dc4b27924784b5469934ce54e0c3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": 
null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d362bde25e934f3c988d73b60c0c43a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d42aab02a56b4d3595b472ba70d60140": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e206b3ee98874a778fdaf98415895eb3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + 
"grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e2220acbd46f40e985c2fd933bfd2911": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6c4c89ff3f442d5aeb529965cd20569": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e9838254fa49415f9294b45b5b2e43dc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e9c242f006004fe7b0451aad4585bece": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ed565b9626b649af8693010e62409eda": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "edc0f80fbb06409387c1f5fc274a6dac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef00de2de7f44fb18171ffd25b37804e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_062e75f35a3b401db7abdd4aa07ef56c", + 
"IPY_MODEL_00f2b09d746741f0b33593688d90e256", + "IPY_MODEL_19a5f60e1320433898ddcc49659b1239" + ], + "layout": "IPY_MODEL_8dc2d8522aef45e7a45b35e096ede730" + } + }, + "f53cb41ab1b24f26bdb00f668a5dd151": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "faace0b7c16a457b85536935607b22d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1fc9ff5ed52d4d9392a76d7e0b56dad9", + "IPY_MODEL_42b56ec25f194b409b98cd8b329e88dc", + "IPY_MODEL_02baf3c61b46482c837a2bfabd6147a0" + ], + "layout": "IPY_MODEL_7373e785482f42449b037db4f496e5bb" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_BertForMultipleChoice.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_BertForMultipleChoice.ipynb new file mode 100644 index 00000000000000..51488e3f466152 --- /dev/null +++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_BertForMultipleChoice.ipynb @@ -0,0 +1,2897 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_V5XcDCnVgSi" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_BertForMultipleChoice.ipynb)\n", + "\n", + "# Import OpenVINO BertForMultipleChoice models from HuggingFace ๐Ÿค— into Spark NLP ๐Ÿš€\n", + "\n", + "This notebook provides a detailed walkthrough on optimizing and exporting BertForMultipleChoice models from HuggingFace for use in Spark NLP, leveraging the various tools provided in the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) ecosystem.\n", + "\n", + "Let's keep in mind a few things before we start ๐Ÿ˜Š\n", + "\n", + "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for BertForMultipleChoice from BertForMultipleChoice and they have to be in `For Multiple Choice` category." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aghasVppVgSk" + }, + "source": [ + "## 1. Export and Save the HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be4HsTDMVgSk" + }, + "source": [ + "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.41.2`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-7L-2ZWUVgSl", + "outputId": "acfa49b6-5b43-4465-e955-6b44233bf0de" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m67.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m38.7/38.7 MB\u001b[0m \u001b[31m31.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m215.7/215.7 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m471.6/471.6 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m422.6/422.6 kB\u001b[0m \u001b[31m37.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m16.0/16.0 MB\u001b[0m \u001b[31m107.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m18.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m49.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m62.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-api-core 2.19.2 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-aiplatform 1.70.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-storage 2.27.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-datastore 2.19.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-firestore 2.16.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-pubsub 2.25.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "googleapis-common-protos 1.65.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires 
protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow-metadata 1.16.1 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers==4.41.2\n", + "!pip install -q --upgrade openvino==2024.1\n", + "!pip install -q --upgrade optimum-intel==1.17.0\n", + "!pip install -q --upgrade onnx==1.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vI7uz_6hVgSl" + }, + "source": [ + "[Optimum Intel](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#openvino) is the interface between the Transformers library and the various model optimization and acceleration tools provided by Intel. HuggingFace models loaded with optimum-intel are automatically optimized for OpenVINO, while being compatible with the Transformers API.\n", + "- Normally, to load a HuggingFace model directly for inference/export, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. However, ForMultipleChoice is not yet available so we will use `openvino.convert_model()` after exporting ONNX model\n", + "- We'll use [irfanamal/bert_multiple_choice](https://huggingface.co/irfanamal/bert_multiple_choice) model from HuggingFace as an example\n", + "- We also need the `vocab.txt` saved from `AutoTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "TDapJ_09nqXQ", + "outputId": "4799efca-1e55-4c2a-8b50-a606eb4159de" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pip in /usr/local/lib/python3.10/dist-packages (24.1.2)\n", + "Collecting pip\n", + " Downloading pip-24.2-py3-none-any.whl.metadata (3.6 kB)\n", + "Downloading pip-24.2-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 24.1.2\n", + " Uninstalling pip-24.1.2:\n", + " Successfully uninstalled pip-24.1.2\n", + "Successfully installed pip-24.2\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m13.2/13.2 MB\u001b[0m \u001b[31m101.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m3.0/3.0 MB\u001b[0m \u001b[31m106.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m9.9/9.9 MB\u001b[0m \u001b[31m97.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "optimum-intel 1.17.0 requires transformers<4.42.0,>=4.36.0, but you have transformers 4.45.2 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.16.1 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install --upgrade pip\n", + "!pip install -q --upgrade transformers[onnx] optimum openvino==2024.1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 313, + "referenced_widgets": [ + "a676af6e59d649fc8028f60b45618360", + "3fabbd91bb2143f499451f03fc57d889", + "5d7f7aff947045b2bf419fd90bcbdbdf", + "ea7df2af277440dd9f591102ec798868", + "4b6518ad1b5e421aa7489f753e13197c", + "e80b6f3a98cb448f9db6715c4ae563f8", + "3663c51f8f0a49c5bfa3d296a605d0d9", + "86fa70cdb7e74f91852065d89e8b4b75", + "53452866031b42989c0de123c1fddd47", + "cf2b3da978b74c88b730d80991721172", + "c262be5de0424e1fbdea89f03054272b", + "3040bf37a68548258838172d5416f5e5", + "19ffcb2af0a54388973bb5054e94408d", + "ab3d68fd480d4c929da2c1c191f31063", + "e34b67def10f4b59a32813fa7491571d", + "06cb30f831114ec49c41c636d9b28ae8", + "0d96520a7a91433398183f8417741d9e", + "4e056830e74842998962e5f400751181", + "dbc83283f61d4658bf8c76791ff50e2c", + "3d7c147735524dafa2bf21ba06d9a5a6", + "7bc80e615c1f4536968b520d09b24a68", + "e118d02e87e64c3db47400bcdd7e0575", + "d6f1bbcb09b14a539cbc82d8e1ce094a", + "ae7b3c84db4a4392a5822284a4f9ea37", + "1d2367d3c413477c96c7e33269c50fe0", + "058399b8568f4e9cb01eff976aa1636d", + "801fb008d0034785b44a567f4d3978a6", + "f02da6b16ec14f789f7375d3f24bbff6", + "3c6f3c6aa84c4080bb00e7aef6c927ae", + "71efc6b754b0428ab26780f17f2a9491", + "459513449ca944f788ce8c001df892e4", + "52df7d293f9c4d869355135f35689357", + "214dd9970cd642b0a308df2871786667", + "17658bca722f43a8a5f1c74cf3de6dee", + "94f33bc2ef3d472fb66acf755fa1d3e2", + "6d568d7ad7ca4304b9b651217a5cdf1e", + "c29b580c197d4dfc8d2e752e792c233b", + "68b6b1f2bf2d42ed8c2abf248e67e2b0", + "fd9b03af8a9442aeb58ec0a37295ed6e", + "180faef4bcb74fc499178968b9bcfdf9", + "cedc793c23fd4bf49c71f951e1d99dc4", + "114f20bfc72b4c2f8b61b84fda5819b0", + "8a6205d29e724cd396db86730b6e90ee", + "14a8b8a7d17144ac98eb410f6b54deb3", + "3973ff35270d4900a7f71dac81a91f4d", + "5548cf637b96446c8ed3e8a1fcb27125", + "9a46eecbd6164ab4ace6523021345c90", + "2b6982d5762e470fa7361d7a32f80c77", + "3f4f48834f3742b28927e82563565804", + "4c3f39d18e314472900e6b66d939e717", + "b8eda1d738914a87b30dc504a27831a2", + "3e15c3bcac6b4fa98effe04f97e70123", + "f0c48b1731e04832aaf217b20207d1b4", + "d7d517cb53b849fba6e43b5c63c7cce5", + "dfc7ed45143348b8b9946faa220de323", + "df5a86c582f1441598054e0b4a65dd58", + "eeda76eea5ed4e898a835a5fda8bdfd1", + "55fdcce2a9e9439f9cf93fb8ae07ed90", + "56b67191b2eb41f1bd256a098be9b165", + "dc96de4898644214a5bb037e52f492f7", + "753024bad5e742a7be237f82fad56204", + 
"8fe5cfed789241be8a35a78945f968a0", + "cf184d75226c42c2ad10e5eea229a7b3", + "720cdf0ffc1a42e18e5d93b465516cdb", + "a52dc47f7d134845b70e8d3fd20f1d09", + "076d3c944afa4746b111044b9d1dc4e6" + ] + }, + "id": "_b89GvQKosA0", + "outputId": "b41608f4-14c6-48a8-faa0-e435d6be95f0" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a676af6e59d649fc8028f60b45618360", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/670 [00:00 0, chunk -> 0, score -> 0.50161844}, []}]|\n", + "|[{chunk, 0, 6, Germany, {sentence -> 0, chunk -> 0, score -> 0.34054035}, []}] |\n", + "|[{chunk, 0, 3, Lion, {sentence -> 0, chunk -> 0, score -> 0.26681268}, []}] |\n", + "|[{chunk, 0, 3, 90ยฐC, {sentence -> 0, chunk -> 0, score -> 0.36368853}, []}] |\n", + "|[{chunk, 0, 4, Mars, {sentence -> 0, chunk -> 0, score -> 0.37099603}, []}] |\n", + "|[{chunk, 0, 10, Portuguese, {sentence -> 0, chunk -> 0, score -> 0.38210675}, []}] |\n", + "|[{chunk, 0, 11, The Mongols, {sentence -> 0, chunk -> 0, score -> 0.30188978}, []}] |\n", + "|[{chunk, 0, 6, Oxygenm, {sentence -> 0, chunk -> 0, score -> 0.4182295}, []}] |\n", + "|[{chunk, 0, 6, Africa, {sentence -> 0, chunk -> 0, score -> 0.36630076}, []}] |\n", + "|[{chunk, 0, 15, Vincent van Gogh, {sentence -> 0, chunk -> 0, score -> 0.351779}, []}] |\n", + "+-----------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline, PipelineModel\n", + "\n", + "document_assembler = MultiDocumentAssembler() \\\n", + " .setInputCols([\"question\", \"choices\"]) \\\n", + " .setOutputCols([\"document_question\", \"document_choices\"])\n", + "\n", + "bert_for_multiple_choice = BertForMultipleChoice() \\\n", + " .load(f\"{MODEL_NAME}_spark_nlp_openvino\") \\\n", + " .setInputCols([\"document_question\", \"document_choices\"])\\\n", + " .setOutputCol(\"answer\") \\\n", + " .setBatchSize(4)\n", + "\n", + "pipeline = Pipeline(stages=[document_assembler, bert_for_multiple_choice])\n", + "pipeline_model = pipeline.fit(testing_df)\n", + "\n", + "pipeline_df = pipeline_model.transform(testing_df)\n", + "\n", + "pipeline_df.select(\"answer\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lpxiq1igoj6c" + }, + "source": [ + "That's it! 
You can now go wild and use hundreds of `BertForMultipleChoice` models from HuggingFace ๐Ÿค— in Spark NLP ๐Ÿš€\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "058399b8568f4e9cb01eff976aa1636d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52df7d293f9c4d869355135f35689357", + "placeholder": "โ€‹", + "style": "IPY_MODEL_214dd9970cd642b0a308df2871786667", + "value": "โ€‡314/314โ€‡[00:00<00:00,โ€‡29.2kB/s]" + } + }, + "06cb30f831114ec49c41c636d9b28ae8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "076d3c944afa4746b111044b9d1dc4e6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0d96520a7a91433398183f8417741d9e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "114f20bfc72b4c2f8b61b84fda5819b0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "14a8b8a7d17144ac98eb410f6b54deb3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "17658bca722f43a8a5f1c74cf3de6dee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_94f33bc2ef3d472fb66acf755fa1d3e2", + "IPY_MODEL_6d568d7ad7ca4304b9b651217a5cdf1e", + "IPY_MODEL_c29b580c197d4dfc8d2e752e792c233b" + ], + "layout": "IPY_MODEL_68b6b1f2bf2d42ed8c2abf248e67e2b0" + } + }, + "180faef4bcb74fc499178968b9bcfdf9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "19ffcb2af0a54388973bb5054e94408d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0d96520a7a91433398183f8417741d9e", + "placeholder": "โ€‹", + "style": "IPY_MODEL_4e056830e74842998962e5f400751181", + "value": "pytorch_model.bin:โ€‡100%" + } + }, + 
"1d2367d3c413477c96c7e33269c50fe0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_71efc6b754b0428ab26780f17f2a9491", + "max": 314, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_459513449ca944f788ce8c001df892e4", + "value": 314 + } + }, + "214dd9970cd642b0a308df2871786667": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2b6982d5762e470fa7361d7a32f80c77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d7d517cb53b849fba6e43b5c63c7cce5", + "placeholder": "โ€‹", + "style": "IPY_MODEL_dfc7ed45143348b8b9946faa220de323", + "value": "โ€‡711k/711kโ€‡[00:00<00:00,โ€‡9.81MB/s]" + } + }, + "3040bf37a68548258838172d5416f5e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_19ffcb2af0a54388973bb5054e94408d", + "IPY_MODEL_ab3d68fd480d4c929da2c1c191f31063", + "IPY_MODEL_e34b67def10f4b59a32813fa7491571d" + ], + "layout": "IPY_MODEL_06cb30f831114ec49c41c636d9b28ae8" + } + }, + "3663c51f8f0a49c5bfa3d296a605d0d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3973ff35270d4900a7f71dac81a91f4d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5548cf637b96446c8ed3e8a1fcb27125", + 
"IPY_MODEL_9a46eecbd6164ab4ace6523021345c90", + "IPY_MODEL_2b6982d5762e470fa7361d7a32f80c77" + ], + "layout": "IPY_MODEL_3f4f48834f3742b28927e82563565804" + } + }, + "3c6f3c6aa84c4080bb00e7aef6c927ae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3d7c147735524dafa2bf21ba06d9a5a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3e15c3bcac6b4fa98effe04f97e70123": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3f4f48834f3742b28927e82563565804": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, 
+ "visibility": null, + "width": null + } + }, + "3fabbd91bb2143f499451f03fc57d889": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e80b6f3a98cb448f9db6715c4ae563f8", + "placeholder": "โ€‹", + "style": "IPY_MODEL_3663c51f8f0a49c5bfa3d296a605d0d9", + "value": "config.json:โ€‡100%" + } + }, + "459513449ca944f788ce8c001df892e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4b6518ad1b5e421aa7489f753e13197c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4c3f39d18e314472900e6b66d939e717": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4e056830e74842998962e5f400751181": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "52df7d293f9c4d869355135f35689357": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "53452866031b42989c0de123c1fddd47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5548cf637b96446c8ed3e8a1fcb27125": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4c3f39d18e314472900e6b66d939e717", + "placeholder": "โ€‹", + "style": "IPY_MODEL_b8eda1d738914a87b30dc504a27831a2", + "value": "tokenizer.json:โ€‡100%" + } + }, + "55fdcce2a9e9439f9cf93fb8ae07ed90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_cf184d75226c42c2ad10e5eea229a7b3", + "max": 125, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_720cdf0ffc1a42e18e5d93b465516cdb", + "value": 125 + } + }, + "56b67191b2eb41f1bd256a098be9b165": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a52dc47f7d134845b70e8d3fd20f1d09", + "placeholder": "โ€‹", + "style": "IPY_MODEL_076d3c944afa4746b111044b9d1dc4e6", + "value": "โ€‡125/125โ€‡[00:00<00:00,โ€‡11.2kB/s]" + } + }, + "5d7f7aff947045b2bf419fd90bcbdbdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_86fa70cdb7e74f91852065d89e8b4b75", + "max": 670, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_53452866031b42989c0de123c1fddd47", + "value": 670 + } + }, + "68b6b1f2bf2d42ed8c2abf248e67e2b0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6d568d7ad7ca4304b9b651217a5cdf1e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cedc793c23fd4bf49c71f951e1d99dc4", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_114f20bfc72b4c2f8b61b84fda5819b0", + 
"value": 231508 + } + }, + "71efc6b754b0428ab26780f17f2a9491": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "720cdf0ffc1a42e18e5d93b465516cdb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "753024bad5e742a7be237f82fad56204": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7bc80e615c1f4536968b520d09b24a68": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "801fb008d0034785b44a567f4d3978a6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "86fa70cdb7e74f91852065d89e8b4b75": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8a6205d29e724cd396db86730b6e90ee": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8fe5cfed789241be8a35a78945f968a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "94f33bc2ef3d472fb66acf755fa1d3e2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fd9b03af8a9442aeb58ec0a37295ed6e", + "placeholder": "โ€‹", + "style": "IPY_MODEL_180faef4bcb74fc499178968b9bcfdf9", + "value": "vocab.txt:โ€‡100%" + } + }, + "9a46eecbd6164ab4ace6523021345c90": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3e15c3bcac6b4fa98effe04f97e70123", + "max": 711494, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f0c48b1731e04832aaf217b20207d1b4", + "value": 711494 + } + }, + "a52dc47f7d134845b70e8d3fd20f1d09": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a676af6e59d649fc8028f60b45618360": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3fabbd91bb2143f499451f03fc57d889", + "IPY_MODEL_5d7f7aff947045b2bf419fd90bcbdbdf", + "IPY_MODEL_ea7df2af277440dd9f591102ec798868" + ], + "layout": "IPY_MODEL_4b6518ad1b5e421aa7489f753e13197c" + } + }, + "ab3d68fd480d4c929da2c1c191f31063": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dbc83283f61d4658bf8c76791ff50e2c", + "max": 438000433, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3d7c147735524dafa2bf21ba06d9a5a6", + "value": 438000433 + } + }, + "ae7b3c84db4a4392a5822284a4f9ea37": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f02da6b16ec14f789f7375d3f24bbff6", + "placeholder": "โ€‹", + "style": "IPY_MODEL_3c6f3c6aa84c4080bb00e7aef6c927ae", + "value": "tokenizer_config.json:โ€‡100%" + } + }, + "b8eda1d738914a87b30dc504a27831a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c262be5de0424e1fbdea89f03054272b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c29b580c197d4dfc8d2e752e792c233b": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8a6205d29e724cd396db86730b6e90ee", + "placeholder": "โ€‹", + "style": "IPY_MODEL_14a8b8a7d17144ac98eb410f6b54deb3", + "value": "โ€‡232k/232kโ€‡[00:00<00:00,โ€‡3.70MB/s]" + } + }, + "cedc793c23fd4bf49c71f951e1d99dc4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf184d75226c42c2ad10e5eea229a7b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf2b3da978b74c88b730d80991721172": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d6f1bbcb09b14a539cbc82d8e1ce094a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ae7b3c84db4a4392a5822284a4f9ea37", + "IPY_MODEL_1d2367d3c413477c96c7e33269c50fe0", + "IPY_MODEL_058399b8568f4e9cb01eff976aa1636d" + ], + "layout": "IPY_MODEL_801fb008d0034785b44a567f4d3978a6" + } + }, + "d7d517cb53b849fba6e43b5c63c7cce5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dbc83283f61d4658bf8c76791ff50e2c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dc96de4898644214a5bb037e52f492f7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "df5a86c582f1441598054e0b4a65dd58": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_eeda76eea5ed4e898a835a5fda8bdfd1", + "IPY_MODEL_55fdcce2a9e9439f9cf93fb8ae07ed90", + "IPY_MODEL_56b67191b2eb41f1bd256a098be9b165" + ], + "layout": "IPY_MODEL_dc96de4898644214a5bb037e52f492f7" + } + }, + "dfc7ed45143348b8b9946faa220de323": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e118d02e87e64c3db47400bcdd7e0575": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e34b67def10f4b59a32813fa7491571d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7bc80e615c1f4536968b520d09b24a68", + "placeholder": "โ€‹", + "style": "IPY_MODEL_e118d02e87e64c3db47400bcdd7e0575", + "value": "โ€‡438M/438Mโ€‡[00:01<00:00,โ€‡257MB/s]" + } + }, + "e80b6f3a98cb448f9db6715c4ae563f8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ea7df2af277440dd9f591102ec798868": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cf2b3da978b74c88b730d80991721172", + "placeholder": "โ€‹", + "style": "IPY_MODEL_c262be5de0424e1fbdea89f03054272b", + "value": "โ€‡670/670โ€‡[00:00<00:00,โ€‡53.8kB/s]" + } + }, + "eeda76eea5ed4e898a835a5fda8bdfd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_753024bad5e742a7be237f82fad56204", + "placeholder": "โ€‹", + "style": "IPY_MODEL_8fe5cfed789241be8a35a78945f968a0", + "value": "special_tokens_map.json:โ€‡100%" + } + }, + "f02da6b16ec14f789f7375d3f24bbff6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + 
"bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f0c48b1731e04832aaf217b20207d1b4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "fd9b03af8a9442aeb58ec0a37295ed6e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From e7b269e93d59dde0eb7f0292d42f57c8ed68388a Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Thu, 24 Oct 2024 11:05:33 +0000 Subject: [PATCH 14/24] name llama_cpp to llamacpp engine --- src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala b/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala index e75a3ce29c61a9..ed1f373defd34e 100644 --- a/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala +++ b/src/main/scala/com/johnsnowlabs/ml/util/ModelEngine.scala @@ -43,7 +43,7 @@ final case object Openvino extends ModelEngine { } final case object LlamaCPP extends ModelEngine { - val name = "llama_cpp" + val name = "llamacpp" } final case object Unknown extends ModelEngine { From 55b490cdf2b548382d064449fa990de4588a2962 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Thu, 24 Oct 2024 12:50:40 +0000 Subject: [PATCH 15/24] add a default engine to the AutoGGUFModel --- 
.../com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index 385b9ddc0e983d..916afb965a2628 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -156,6 +156,8 @@ class AutoGGUFModel(override val uid: String) private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) + setDefault(engine -> LlamaCPP.name) + override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) getModelIfNotSet.saveToFile(path) From 7f3704ef74c3b604a02e0131ba3379601bb7c00b Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Thu, 24 Oct 2024 20:42:14 +0200 Subject: [PATCH 16/24] Bump to 5.5.1 and update CHANGELOG [run doc] --- CHANGELOG | 33 ++++ README.md | 14 +- build.sbt | 2 +- docs/_layouts/landing.html | 2 +- docs/en/advanced_settings.md | 6 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 144 +++++++++--------- docs/en/spark_nlp.md | 2 +- python/README.md | 14 +- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 19 files changed, 138 insertions(+), 105 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 8912a2e7a41dac..5a09975bb542a6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,36 @@ +======== +5.5.1 +======== +---------------- +New Features & Enhancements +---------------- +* `BertForMultipleChoice` Transformer Added. Enhanced BERTโ€™s capabilities to handle multiple-choice tasks such as standardized test questions and survey or quiz automation. +* Integrated New Tasks and Documentation: + * Added support and documentation for the following tasks: + * Automatic Speech Recognition + * Dependency Parsing + * Image Captioning + * Image Classification + * Landing Page + * Question Answering + * Summarization + * Table Question Answering + * Text Classification + * Text Generation + * Text Preprocessing + * Token Classification + * Translation + * Zero-Shot Classification + * Zero-Shot Image Classification +* `PromptAssembler` Annotator Introduced. Introduced a new annotator that constructs prompts for LLMs using a chat template and a sequence of messages. Accepts an array of tuples with roles (โ€œsystemโ€, โ€œuserโ€, โ€œassistantโ€) and message texts. Utilizes llama.cpp as a backend for template parsing, supporting basic template applications. + +---------------- +Bug Fixes +---------------- +* Resolved Pretrained Model Loading Issue on DBFS Systems. +* Fixed a bug where pretrained models were not found when running AutoGGUF model pipelines on Databricks due to incorrect path handling of gguf files. 
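To make the `PromptAssembler` entry above concrete, here is a minimal Python sketch of how such an annotator could be driven from a Spark NLP pipeline. It is a sketch under stated assumptions only: the import location, the `setInputCol`/`setOutputCol`/`setChatTemplate` setters, the `role`/`text` field names, and the placeholder template string are illustrative and are not taken from this patch series.

```python
# Illustrative sketch only; class name, setters, and field names below are assumptions,
# not confirmed by this patch series.
import sparknlp
from pyspark.sql import Row
from sparknlp.base import PromptAssembler  # assumed import path

spark = sparknlp.start()

# One chat per row, expressed as an array of (role, text) messages as described in the changelog.
chat = [
    Row(role="system", text="You are a helpful assistant."),
    Row(role="user", text="Summarize what Spark NLP does in one sentence."),
]
df = spark.createDataFrame([Row(messages=chat)])

# Placeholder chat template; real templates are model-specific and parsed by the llama.cpp backend.
template = (
    "{% for message in messages %}"
    "<|{{ message.role }}|>\n{{ message.text }}\n"
    "{% endfor %}<|assistant|>\n"
)

prompt_assembler = (
    PromptAssembler()            # assumed class name, taken from the changelog entry above
    .setInputCol("messages")     # column holding the array of role/text messages
    .setOutputCol("prompt")      # column that will receive the rendered prompt string
    .setChatTemplate(template)   # assumed setter for supplying the chat template
)

prompt_assembler.transform(df).select("prompt").show(truncate=False)
```

The rendered `prompt` column would then typically be passed on to a text-generation annotator such as `AutoGGUFModel`, which the same release series gives a default `llamacpp` engine.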
+ + ======== 5.5.0 ======== diff --git a/README.md b/README.md index 77ae82f10edec9..e5af113964073d 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.5.0 pyspark==3.3.1 +$ pip install spark-nlp==5.5.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -129,7 +129,7 @@ For a quick example of using pipelines and models take a look at our official [d ### Apache Spark Support -Spark NLP *5.5.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x +Spark NLP *5.5.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -157,7 +157,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http ### Databricks Support -Spark NLP 5.5.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.5.1 has been tested and is compatible with the following runtimes: | **CPU** | **GPU** | |--------------------|--------------------| @@ -174,7 +174,7 @@ We are compatible with older runtimes. For a full list check databricks support ### EMR Support -Spark NLP 5.5.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.5.1 has been tested and is compatible with the following EMR releases: | **EMR Release** | |--------------------| @@ -205,7 +205,7 @@ deployed to Maven central. To add any of our packages as a dependency in your ap from our official documentation. If you are interested, there is a simple SBT project for Spark NLP to guide you on how to use it in your -projects [Spark NLP SBT S5.5.0r](https://github.com/maziyarpanahi/spark-nlp-starter) +projects [Spark NLP SBT S5.5.1r](https://github.com/maziyarpanahi/spark-nlp-starter) ### Python @@ -250,7 +250,7 @@ In Spark NLP we can define S3 locations to: Please check [these instructions](https://sparknlp.org/docs/en/install#s3-integration) from our official documentation. -## Document5.5.0 +## Document5.5.1 ### Examples @@ -283,7 +283,7 @@ the Spark NLP library: keywords = {Spark, Natural language processing, Deep learning, Tensorflow, Cluster}, abstract = {Spark NLP is a Natural Language Processing (NLP) library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines that can scale easily in a distributed environment. Spark NLP comes with 1100+ pretrained pipelines and models in more than 192+ languages. It supports nearly all the NLP tasks and modules that can be used seamlessly in a cluster. 
Downloaded more than 2.7 million times and experiencing 9x growth since January 2020, Spark NLP is used by 54% of healthcare organizations as the worldโ€™s most widely used NLP library in the enterprise.} } -}5.5.0 +}5.5.1 ``` ## Community support diff --git a/build.sbt b/build.sbt index 153af1f7a25e24..821cbaa31b11ab 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.5.0" +version := "5.5.1" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index c67ff52b47e214..105f3bde451c47 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -201,7 +201,7 @@

{{ _section.title }}

{% highlight bash %} # Using PyPI - $ pip install spark-nlp==5.5.0 + $ pip install spark-nlp==5.5.1 # Using Anaconda/Conda $ conda install -c johnsnowlabs spark-nlp diff --git a/docs/en/advanced_settings.md b/docs/en/advanced_settings.md index 71137f181acd1f..f21bf11d56a93a 100644 --- a/docs/en/advanced_settings.md +++ b/docs/en/advanced_settings.md @@ -52,7 +52,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1") .getOrCreate() ``` @@ -66,7 +66,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 ``` **pyspark:** @@ -79,7 +79,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 ``` **Databricks:** diff --git a/docs/en/concepts.md b/docs/en/concepts.md index 1ddad7b4b08c7c..5d9dddfcb0b550 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -66,7 +66,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.5.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.5.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index 7b672dbf9b67e6..ea8e967ee7be27 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -18,7 +18,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==5.5.0 pyspark==3.3.1 +$ pip install spark-nlp==5.5.1 pyspark==3.3.1 ```
@@ -40,7 +40,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 5.5.0 +!bash colab.sh -p 3.2.3 -s 5.5.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index ae521ca155bde2..20703eb8a5a8d9 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -50,7 +50,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 5.5.0 is built with TensorFlow 2.7.1 and the following NVIDIAยฎ software are only required for GPU support: +Spark NLP 5.5.1 is built with TensorFlow 2.7.1 and the following NVIDIAยฎ software are only required for GPU support: - NVIDIAยฎ GPU drivers version 450.80.02 or higher - CUDAยฎ Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index dfad2d86bf6f8e..2d3796b14419fc 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -17,27 +17,27 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==5.5.0 +pip install spark-nlp==5.5.1 # Install Spark NLP from Anaconda/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 # Load Spark NLP with Spark Submit -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-5.5.0.jar +spark-shell --jars spark-nlp-assembly-5.5.1.jar ``` **GPU (optional):** -Spark NLP 5.5.0 is built with ONNX 1.17.0 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIAยฎ software are only required for GPU support: +Spark NLP 5.5.1 is built with ONNX 1.17.0 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIAยฎ software are only required for GPU support: - NVIDIAยฎ GPU drivers version 450.80.02 or higher - CUDAยฎ Toolkit 11.2 @@ -55,7 +55,7 @@ python version, consider sticking to lower versions of Spark.
#### Quick Install -5.5.0 +5.5.1 Let's create a new Conda environment to manage all the dependencies there. You can use Python Virtual Environment if you prefer or not have any environment. ```bash @@ -63,7 +63,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==5.5.0 pyspark==3.3.1 +$ pip install spark-nlp==5.5.1 pyspark==3.3.1 ``` Of course you will need to have jupyter installed in your system: @@ -92,7 +92,7 @@ spark = sparknlp.start() If you need to manually start SparkSession because you have other configurations and `sparknlp.start()` is not including them, you can manually start the SparkSession with: -```python5.5.0 +```python5.5.1 spark = SparkSession.builder \ .appName("Spark NLP") \ .master("local[*]") \ @@ -100,7 +100,7 @@ spark = SparkSession.builder \ .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \ .config("spark.kryoserializer.buffer.max", "2000M") \ .config("spark.driver.maxResultSize", "0") \ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0") \ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1") \ .getOrCreate() ``` If using local jars, you can use `spark.jars` instead for comma-delimited jar files. For cluster setups, of course, @@ -111,18 +111,18 @@ you'll have to put the jars in a reachable location for all driver and executor ### Python without explicit Pyspark installation ### Pip/Conda -5.5.0 +5.5.1 If you installed pyspark through pip/conda, you can install `spark-nlp` through the same channel. Pip: ```bash -pip install spark-nlp==5.5.0 +pip install spark-nlp==5.5.1 ``` Conda: -```bash5.5.0 +```bash5.5.1 conda install -c johnsnowlabs spark-nlp ``` @@ -133,7 +133,7 @@ Then you'll have to create a SparkSession either from Spark NLP: ```python import sparknlp -5.5.0 +5.5.1 spark = sparknlp.start() ``` @@ -144,7 +144,7 @@ import sparknlp from sparknlp.pretrained import PretrainedPipeline # create or get Spark Session -5.5.0 +5.5.1 spark = sparknlp.start() sparknlp.version() @@ -156,28 +156,28 @@ pipeline = PretrainedPipeline('recognize_entities_dl', 'en') result = pipeline.annotate('The Mona Lisa is a 16th century oil painting created by Leonardo') ``` -
5.5.0 +
5.5.1 ## Scala and Java To use Spark NLP you need the following requirements: - Java 8 and 11 -- Apache Spark 3.5.x, 3.4.x, 3.3.x, 3.2.x, 3.1.x, 3.0.x5.5.0 +- Apache Spark 3.5.x, 3.4.x, 3.3.x, 3.2.x, 3.1.x, 3.0.x5.5.1 #### Maven **spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x The `spark-nlp` has been published to -the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowla5.5.0p/spark-nlp). +the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowla5.5.1p/spark-nlp). ```xml com.johnsnowlabs.nlp spark-nlp_2.12 - 5.5.05.5.0 + 5.5.15.5.1 ``` @@ -188,7 +188,7 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowla5.5.0p/s com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.5.0 + 5.5.1 ``` @@ -199,7 +199,7 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowla5.5.0p/s com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.5.0 + 5.5.1 ``` @@ -210,7 +210,7 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowla5.5.0p/s com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.5.0 + 5.5.1 ``` @@ -222,28 +222,28 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowla5.5.0p/s ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.5.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.5.1" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.5.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.5.1" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.5.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.5.1" ``` **spark-nlp-aarch64:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.5.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.5.1" ``` Maven Central: [https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -259,7 +259,7 @@ at the moment, only the standard variant of the M1 is supported. Other variants M1 Pro/Max/Ultra, M2) will most likely not work. Make sure the following prerequisites are met: -5.5.0 +5.5.1 1. An M1 compiled java version needs to be installed. For example to install the Zulu Java 11 JDK head to [Download Azul JDKs](https://www.azul.com/downloads/?version=java-11-lts&os=macos&architecture=arm-64-bit&package=jdk) and install that java version. @@ -267,7 +267,7 @@ Make sure the following prerequisites are met: rosetta, you can run the following commands in your shell: ```shell - johnsnow@m1mac ~ % cat $(which java) | file -5.5.0 + johnsnow@m1mac ~ % cat $(which java) | file -5.5.1 /dev/stdin: Mach-O 64-bit executable arm64 ``` @@ -305,7 +305,7 @@ rocksdbjni-6.20.3.jar ``` to find the jar you have to remove. After removing the jar, the pipelines should work -as expected.5.5.0 +as expected.5.5.1
@@ -319,11 +319,11 @@ This steps require internet connection. ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 ``` The `spark-nlp` has been published to @@ -332,11 +332,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.5.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.5.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.5.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.5.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.5.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.5.1 ``` @@ -346,13 +346,13 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.5.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.5.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.5.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.5.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.5.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.5.1 -```5.5.0 +```5.5.1 The `spark-nlp-aarch64` has been published to the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64). @@ -360,11 +360,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.5.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.5.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.5.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.5.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.5.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.5.1 ``` @@ -374,11 +374,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s **NOTE**: In case you are using large pretrained models like UniversalSentenceEncoder, you need to have the following set in your SparkSession: -```sh5.5.0 +```sh5.5.1 spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 ```
@@ -402,7 +402,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.5.0 + 5.5.1 ``` @@ -410,7 +410,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.5.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.5.1" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -473,7 +473,7 @@ as expected. ## Installation for Linux Aarch64 Systems -Starting from version 5.5.0, Spark NLP supports Linux systems running on an aarch64 +Starting from version 5.5.1, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -484,7 +484,7 @@ to install Spark NLP for your system. ### Starting Spark NLP -Spark NLP needs to be started with the `aarch64` flag set to `true`:5.5.0 +Spark NLP needs to be started with the `aarch64` flag set to `true`:5.5.1 For Scala: @@ -504,7 +504,7 @@ spark = sparknlp.start(aarch64=True)
-## Google 5.5.0 Notebook +## Google 5.5.1 Notebook Google Colab is perhaps the easiest way to get started with spark-nlp. It requires no installation or setup other than having a Google account. @@ -521,7 +521,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.5.0 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.5.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. @@ -548,7 +548,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -561,7 +561,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.5.0 +pip install spark-nlp==5.5.1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -569,7 +569,7 @@ Or you can install `spark-nlp` from inside Zeppelin by using Conda: ```bash python.conda install -c johnsnowlabs spark-nlp ``` -5.5.0 +5.5.1 Configure Zeppelin properly, use cells with %spark.pyspark or any interpreter name you chose. Finally, in Zeppelin interpreter settings, make sure you set properly zeppelin.python to the python you want to use and @@ -581,7 +581,7 @@ shown earlier since it includes both scala and python side installation.
## Jupyter Notebook -5.5.0 +5.5.1 **Recommended:** The easiest way to get this done on Linux and macOS is to simply install `spark-nlp` and `pyspark` PyPI packages and @@ -591,7 +591,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.5.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.5.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -608,7 +608,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -631,9 +631,9 @@ pointed [here](#python-without-explicit-pyspark-installation) 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.5.0` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.5.1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -686,7 +686,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1" } }] ``` @@ -695,7 +695,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.5.0" \ +--name "Spark NLP 5.5.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -761,7 +761,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -771,7 +771,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ ## Apache Spark Support -Spark NLP *5.5.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x +Spark NLP *5.5.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x {:.table-model-big} | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x | @@ -807,7 +807,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.5.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.5.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -854,7 +854,7 @@ Spark NLP 5.5.0 has been tested and is compatible with the following runtimes: - 9.1 ML & GPU - 10.1 ML & GPU -- 10.2 ML & GPU5.5.0 +- 10.2 ML & GPU5.5.1 - 10.3 ML & GPU - 10.4 ML & GPU - 10.5 ML & GPU @@ -887,9 +887,9 @@ Spark NLP 5.5.0 has been tested and is compatible with the following runtimes: 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp` -> Install5.5.0 + 3.1. Install New -> PyPI -> `spark-nlp` -> Install5.5.1 - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -909,7 +909,7 @@ Note: You can import these notebooks by using their URLs. ## EMR Support -Spark NLP 5.5.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.5.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -972,7 +972,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1" } } ] @@ -982,7 +982,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 5.5.0" \ +--name "Spark NLP 5.5.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -1247,7 +1247,7 @@ We recommend using `conda` to manage your Python environment on Windows. Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.0*. +Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.5.1*. 
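Since the tables above pin Spark NLP 5.5.1 to Apache Spark 3.0.x through 3.5.x, a quick runtime check can save a confusing failure later when a cluster image drifts. A small, illustrative sketch, assuming only the `spark-nlp` and `pyspark` packages installed above:

```python
# Confirm the active Spark version is one of the 3.x lines listed in the
# compatibility table before loading any pretrained models or pipelines.
import sparknlp

spark = sparknlp.start()
major_minor = ".".join(spark.version.split(".")[:2])
supported = {"3.0", "3.1", "3.2", "3.3", "3.4", "3.5"}
assert major_minor in supported, (
    f"Spark NLP 5.5.1 targets Apache Spark 3.0.x to 3.5.x, found {spark.version}"
)
print("Spark NLP", sparknlp.version(), "on Apache Spark", spark.version)
```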
@@ -1275,12 +1275,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-5.5.0.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-5.5.1.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.5.0.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.5.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index 608edd45c00bd0..9ff302c01d8865 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. For using Spark NLP you need: **GPU (optional):** -Spark NLP 5.5.0 is built with TensorFlow 2.7.1 and the following NVIDIAยฎ software are only required for GPU support: +Spark NLP 5.5.1 is built with TensorFlow 2.7.1 and the following NVIDIAยฎ software are only required for GPU support: - NVIDIAยฎ GPU drivers version 450.80.02 or higher - CUDAยฎ Toolkit 11.2 diff --git a/python/README.md b/python/README.md index 77ae82f10edec9..e5af113964073d 100644 --- a/python/README.md +++ b/python/README.md @@ -63,7 +63,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.5.0 pyspark==3.3.1 +$ pip install spark-nlp==5.5.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -129,7 +129,7 @@ For a quick example of using pipelines and models take a look at our official [d ### Apache Spark Support -Spark NLP *5.5.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x +Spark NLP *5.5.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -157,7 +157,7 @@ Find out more about 4.x `SparkNLP` versions in our official [documentation](http ### Databricks Support -Spark NLP 5.5.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.5.1 has been tested and is compatible with the following runtimes: | **CPU** | **GPU** | |--------------------|--------------------| @@ -174,7 +174,7 @@ We are compatible with older runtimes. 
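As a sketch of the offline flow described above, assuming a pretrained pipeline has already been downloaded from the Models Hub and unpacked to storage reachable by the `spark` session built with the Fat JAR (the folder name below is only a placeholder):

```python
# Load a previously downloaded pipeline from disk instead of fetching it from
# the internet, then run it on a tiny DataFrame with a "text" column.
from pyspark.ml import PipelineModel

offline_pipeline = PipelineModel.load("/tmp/explain_document_dl_en")  # placeholder path
data = spark.createDataFrame([["Spark NLP also works fully offline."]]).toDF("text")
result = offline_pipeline.transform(data)
result.printSchema()  # output columns depend on the stages of the loaded pipeline
```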
For a full list check databricks support ### EMR Support -Spark NLP 5.5.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.5.1 has been tested and is compatible with the following EMR releases: | **EMR Release** | |--------------------| @@ -205,7 +205,7 @@ deployed to Maven central. To add any of our packages as a dependency in your ap from our official documentation. If you are interested, there is a simple SBT project for Spark NLP to guide you on how to use it in your -projects [Spark NLP SBT S5.5.0r](https://github.com/maziyarpanahi/spark-nlp-starter) +projects [Spark NLP SBT S5.5.1r](https://github.com/maziyarpanahi/spark-nlp-starter) ### Python @@ -250,7 +250,7 @@ In Spark NLP we can define S3 locations to: Please check [these instructions](https://sparknlp.org/docs/en/install#s3-integration) from our official documentation. -## Document5.5.0 +## Document5.5.1 ### Examples @@ -283,7 +283,7 @@ the Spark NLP library: keywords = {Spark, Natural language processing, Deep learning, Tensorflow, Cluster}, abstract = {Spark NLP is a Natural Language Processing (NLP) library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines that can scale easily in a distributed environment. Spark NLP comes with 1100+ pretrained pipelines and models in more than 192+ languages. It supports nearly all the NLP tasks and modules that can be used seamlessly in a cluster. Downloaded more than 2.7 million times and experiencing 9x growth since January 2020, Spark NLP is used by 54% of healthcare organizations as the worldโ€™s most widely used NLP library in the enterprise.} } -}5.5.0 +}5.5.1 ``` ## Community support diff --git a/python/docs/conf.py b/python/docs/conf.py index 7801e4e30c260f..e9844b3f983a09 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "5.5.0" +release = "5.5.1" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index cebe55084f4427..29b31cb32d7b0f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.5.0', # Required + version='5.5.1', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 69f22315e3e143..a9fd8182649d82 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -129,7 +129,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "5.5.0" + current_version = "5.5.1" if params is None: params = {} @@ -310,4 +310,4 @@ def version(): str The current Spark NLP version. 
""" - return '5.5.0' + return '5.5.1' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index c2215254af96aa..09d3f32600429d 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.5.0" +SPARKNLP="5.5.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index d31052813ec56a..6f3d446c5608bc 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.5.0" +SPARKNLP="5.5.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index b7b3e172548278..d0bbb43638e3d5 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.5.0" +SPARKNLP="5.5.1" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 06ac2ae1fd3ae0..452a59eac9540a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "5.5.0" + val currentVersion = "5.5.1" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index b32776e9241322..d15aded1cf53ac 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ -17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "5.5.0" + val version: String = "5.5.1" } From e2f669c9968abaf76f164a8c68ba7be418efd254 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 24 Oct 2024 18:52:31 +0000 Subject: [PATCH 17/24] Update Scala and Python APIs --- docs/api/com/index.html | 8 +- .../com/johnsnowlabs/client/CloudClient.html | 8 +- .../com/johnsnowlabs/client/CloudManager.html | 8 +- .../johnsnowlabs/client/CloudResources$.html | 8 +- .../com/johnsnowlabs/client/CloudStorage.html | 8 +- .../client/aws/AWSAnonymousCredentials.html | 8 +- .../client/aws/AWSBasicCredentials.html | 8 +- .../johnsnowlabs/client/aws/AWSClient.html | 8 +- .../client/aws/AWSCredentialsProvider.html | 8 +- .../johnsnowlabs/client/aws/AWSGateway.html | 8 +- .../client/aws/AWSProfileCredentials.html | 8 +- .../client/aws/AWSTokenCredentials.html | 8 +- .../client/aws/CredentialParams.html | 8 +- .../johnsnowlabs/client/aws/Credentials.html | 8 +- .../com/johnsnowlabs/client/aws/index.html | 8 +- .../client/azure/AzureClient.html | 8 +- .../client/azure/AzureGateway.html | 8 +- .../com/johnsnowlabs/client/azure/index.html | 8 +- .../johnsnowlabs/client/gcp/GCPClient.html | 8 +- .../johnsnowlabs/client/gcp/GCPGateway.html | 8 +- .../com/johnsnowlabs/client/gcp/index.html | 8 +- docs/api/com/johnsnowlabs/client/index.html | 8 +- .../client/util/CloudHelper$.html | 8 +- .../com/johnsnowlabs/client/util/index.html | 8 +- .../johnsnowlabs/collections/SearchTrie$.html | 8 +- 
.../johnsnowlabs/collections/SearchTrie.html | 8 +- .../collections/StorageSearchTrie$.html | 8 +- .../collections/StorageSearchTrie.html | 8 +- .../com/johnsnowlabs/collections/index.html | 8 +- docs/api/com/johnsnowlabs/index.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/DeBerta.html | 8 +- .../ml/ai/MergeTokenStrategy$.html | 8 +- .../johnsnowlabs/ml/ai/OpenAICompletion.html | 8 +- .../johnsnowlabs/ml/ai/OpenAIEmbeddings$.html | 8 +- .../johnsnowlabs/ml/ai/OpenAIEmbeddings.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/index.html | 8 +- .../com/johnsnowlabs/ml/ai/model/Choice.html | 8 +- .../ml/ai/model/CompletionResponse.html | 8 +- .../ml/ai/model/EmbeddingData.html | 8 +- .../ml/ai/model/TextEmbeddingResponse.html | 8 +- .../com/johnsnowlabs/ml/ai/model/Usage.html | 8 +- .../johnsnowlabs/ml/ai/model/UsageData.html | 8 +- .../com/johnsnowlabs/ml/ai/model/index.html | 8 +- .../ml/ai/seq2seq/DecoderProcessor.html | 8 +- .../ml/ai/seq2seq/OnnxT5EncoderDecoder.html | 8 +- .../ai/seq2seq/OpenvinoT5EncoderDecoder.html | 8 +- .../ml/ai/seq2seq/T5EncoderDecoder.html | 8 +- .../com/johnsnowlabs/ml/ai/seq2seq/index.html | 8 +- .../ml/ai/t5/OnnxT5EncoderDecoder.html | 8 +- .../t5/T5EncoderDecoder$DecoderProcessor.html | 8 +- .../ml/ai/t5/T5EncoderDecoder.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/t5/index.html | 8 +- .../ml/ai/util/Generation/Generate.html | 8 +- .../ai/util/Generation/GenerationConfig.html | 8 +- .../ml/ai/util/Generation/Logit/Logit.html | 8 +- .../ForcedTokenLogitProcessor.html | 8 +- .../Logit/LogitProcess/LogitProcessor.html | 8 +- .../LogitProcess/MinLengthLogitProcessor.html | 8 +- .../NoRepeatNgramsLogitProcessor.html | 8 +- .../RepetitionPenaltyLogitProcessor.html | 8 +- .../LogitProcess/SuppressLogitProcessor.html | 8 +- .../Generation/Logit/LogitProcess/index.html | 8 +- .../Generation/Logit/LogitProcessorList.html | 8 +- .../Logit/LogitWarper/LogitWarper.html | 8 +- .../LogitWarper/TemperatureLogitWarper.html | 8 +- .../Logit/LogitWarper/TopKLogitWarper.html | 8 +- .../Logit/LogitWarper/TopPLogitWarper.html | 8 +- .../Generation/Logit/LogitWarper/index.html | 8 +- .../ml/ai/util/Generation/Logit/index.html | 8 +- .../Generation/Search/BeamHypotheses.html | 8 +- .../ai/util/Generation/Search/BeamScorer.html | 8 +- .../Generation/Search/BeamSearchScorer.html | 8 +- .../ml/ai/util/Generation/Search/index.html | 8 +- .../ml/ai/util/Generation/index.html | 8 +- .../com/johnsnowlabs/ml/ai/util/index.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/Attr.html | 8 +- .../com/johnsnowlabs/ml/crf/AttrFeature.html | 8 +- .../api/com/johnsnowlabs/ml/crf/AttrStat.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfDataset.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfParams.html | 8 +- .../johnsnowlabs/ml/crf/DatasetEncoder.html | 8 +- .../johnsnowlabs/ml/crf/DatasetMetadata.html | 8 +- .../johnsnowlabs/ml/crf/DatasetReader$.html | 8 +- .../johnsnowlabs/ml/crf/EdgeCalculator$.html | 8 +- .../com/johnsnowlabs/ml/crf/FbCalculator.html | 8 +- .../api/com/johnsnowlabs/ml/crf/Instance.html | 8 +- .../johnsnowlabs/ml/crf/InstanceLabels.html | 8 +- .../johnsnowlabs/ml/crf/L2DecayStrategy.html | 8 +- .../johnsnowlabs/ml/crf/LinearChainCrf.html | 8 +- .../ml/crf/LinearChainCrfModel.html | 8 +- .../ml/crf/SerializedDatasetMetadata.html | 8 +- .../ml/crf/SerializedLinearChainCrfModel.html | 8 +- .../ml/crf/SparseArray$$SeqWrapper.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray$.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray.html | 8 +- .../ml/crf/TextSentenceAttrs.html | 8 +- 
.../ml/crf/TextSentenceLabels.html | 8 +- .../com/johnsnowlabs/ml/crf/Transition.html | 8 +- .../com/johnsnowlabs/ml/crf/VectorMath$.html | 8 +- .../com/johnsnowlabs/ml/crf/WordAttrs.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/index.html | 8 +- .../johnsnowlabs/ml/gguf/GGUFWrapper$.html | 24 +- .../com/johnsnowlabs/ml/gguf/GGUFWrapper.html | 8 +- docs/api/com/johnsnowlabs/ml/gguf/index.html | 8 +- docs/api/com/johnsnowlabs/ml/index.html | 8 +- .../com/johnsnowlabs/ml/onnx/OnnxSession.html | 8 +- .../ml/onnx/OnnxWrapper$$DecoderWrappers.html | 8 +- ...er$$EncoderDecoderWithoutPastWrappers.html | 8 +- .../OnnxWrapper$$EncoderDecoderWrappers.html | 8 +- .../johnsnowlabs/ml/onnx/OnnxWrapper$.html | 8 +- .../com/johnsnowlabs/ml/onnx/OnnxWrapper.html | 8 +- .../johnsnowlabs/ml/onnx/ReadOnnxModel.html | 10 +- ...sources$$implicits$$OnnxSessionResult.html | 8 +- .../ml/onnx/TensorResources$$implicits$.html | 8 +- .../ml/onnx/TensorResources$.html | 8 +- .../johnsnowlabs/ml/onnx/TensorResources.html | 8 +- .../johnsnowlabs/ml/onnx/WriteOnnxModel.html | 10 +- docs/api/com/johnsnowlabs/ml/onnx/index.html | 8 +- .../OpenvinoWrapper$$DecoderWrappers.html | 8 +- ...er$$EncoderDecoderWithoutPastWrappers.html | 8 +- ...envinoWrapper$$EncoderDecoderWrappers.html | 8 +- .../ml/openvino/OpenvinoWrapper$.html | 8 +- .../ml/openvino/OpenvinoWrapper.html | 8 +- .../ml/openvino/ReadOpenvinoModel.html | 10 +- .../ml/openvino/WriteOpenvinoModel.html | 10 +- .../com/johnsnowlabs/ml/openvino/index.html | 8 +- .../tensorflow/ClassifierDatasetEncoder.html | 8 +- .../ClassifierDatasetEncoderParams.html | 8 +- .../ml/tensorflow/DatasetEncoderParams.html | 8 +- .../johnsnowlabs/ml/tensorflow/Logging.html | 8 +- .../ml/tensorflow/ModelSignature.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch$.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch.html | 8 +- .../ml/tensorflow/NerDatasetEncoder.html | 8 +- .../ml/tensorflow/ReadTensorflowModel.html | 8 +- .../ml/tensorflow/SentenceGrouper.html | 8 +- .../ml/tensorflow/TensorResources$.html | 8 +- .../ml/tensorflow/TensorResources.html | 8 +- .../ml/tensorflow/TensorflowClassifier.html | 8 +- .../ml/tensorflow/TensorflowWrapper$.html | 8 +- .../ml/tensorflow/TensorflowWrapper.html | 8 +- .../johnsnowlabs/ml/tensorflow/Variables.html | 8 +- .../ml/tensorflow/WriteTensorflowModel.html | 8 +- .../com/johnsnowlabs/ml/tensorflow/index.html | 8 +- .../sentencepiece/ReadSentencePieceModel.html | 8 +- .../sentencepiece/SentencePieceException.html | 8 +- .../sentencepiece/SentencePieceProcessor.html | 8 +- .../sentencepiece/SentencePieceWrapper$.html | 8 +- .../WriteSentencePieceModel.html | 8 +- .../ml/tensorflow/sentencepiece/index.html | 8 +- ...delSignatureConstants$$AttentionMask$.html | 8 +- ...lSignatureConstants$$AttentionMaskV1$.html | 8 +- ...SignatureConstants$$AudioValuesInput$.html | 8 +- ...s$$CachedDecoderEncoderAttentionMask$.html | 8 +- ...stants$$CachedDecoderEncoderInputIds$.html | 8 +- ...eConstants$$CachedDecoderInputCache1$.html | 8 +- ...eConstants$$CachedDecoderInputCache2$.html | 8 +- ...tureConstants$$CachedDecoderInputIds$.html | 8 +- ...natureConstants$$CachedEncoderOutput$.html | 8 +- ...gnatureConstants$$CachedLogitsOutput$.html | 8 +- ...delSignatureConstants$$CachedOutPut2$.html | 8 +- ...delSignatureConstants$$CachedOutput1$.html | 8 +- .../sign/ModelSignatureConstants$$DType$.html | 8 +- ...atureConstants$$DecoderAttentionMask$.html | 8 +- ...ureConstants$$DecoderCachedCache1Key$.html | 8 +- ...ureConstants$$DecoderCachedCache2Key$.html | 8 +- 
...ts$$DecoderCachedEncoderAttentionKey$.html | 8 +- ...stants$$DecoderCachedEncoderStateKey$.html | 8 +- ...eConstants$$DecoderCachedInputIdsKey$.html | 8 +- ...natureConstants$$DecoderCachedOutput$.html | 8 +- ...stants$$DecoderCachedOutputCache1Key$.html | 8 +- ...stants$$DecoderCachedOutputCache2Key$.html | 8 +- ...ureConstants$$DecoderCachedOutputKey$.html | 8 +- ...nstants$$DecoderEncoderAttentionMask$.html | 8 +- ...ureConstants$$DecoderEncoderInputIds$.html | 8 +- ...onstants$$DecoderInitOutputCache1Key$.html | 8 +- ...onstants$$DecoderInitOutputCache2Key$.html | 8 +- ...lSignatureConstants$$DecoderInputIds$.html | 8 +- ...delSignatureConstants$$DecoderOutput$.html | 8 +- .../ModelSignatureConstants$$DimCount$.html | 8 +- ...atureConstants$$EncoderAttentionMask$.html | 8 +- ...gnatureConstants$$EncoderContextMask$.html | 8 +- ...lSignatureConstants$$EncoderInputIds$.html | 8 +- ...delSignatureConstants$$EncoderOutput$.html | 8 +- ...lSignatureConstants$$EndLogitsOutput$.html | 8 +- ...ignatureConstants$$InitCachedOutPut2$.html | 8 +- ...ignatureConstants$$InitCachedOutput1$.html | 8 +- ...nts$$InitDecoderEncoderAttentionMask$.html | 8 +- ...onstants$$InitDecoderEncoderInputIds$.html | 8 +- ...natureConstants$$InitDecoderInputIds$.html | 8 +- ...SignatureConstants$$InitLogitsOutput$.html | 8 +- .../ModelSignatureConstants$$InputIds$.html | 8 +- .../ModelSignatureConstants$$InputIdsV1$.html | 8 +- ...lSignatureConstants$$LastHiddenState$.html | 8 +- ...ignatureConstants$$LastHiddenStateV1$.html | 8 +- ...odelSignatureConstants$$LogitsOutput$.html | 8 +- .../sign/ModelSignatureConstants$$Name$.html | 8 +- ...SignatureConstants$$PixelValuesInput$.html | 8 +- ...odelSignatureConstants$$PoolerOutput$.html | 8 +- ...elSignatureConstants$$PoolerOutputV1$.html | 8 +- ...elSignatureConstants$$SerializedSize$.html | 8 +- ...odelSignatureConstants$$ShapeDimList$.html | 8 +- ...ignatureConstants$$StartLogitsOutput$.html | 8 +- ...lSignatureConstants$$TFInfoDescriptor.html | 8 +- ...lSignatureConstants$$TFInfoNameMapper.html | 8 +- ...stants$$TapasLogitsAggregationOutput$.html | 8 +- ...ignatureConstants$$TapasLogitsOutput$.html | 8 +- ...odelSignatureConstants$$TokenTypeIds$.html | 8 +- ...elSignatureConstants$$TokenTypeIdsV1$.html | 8 +- .../sign/ModelSignatureConstants$.html | 8 +- .../sign/ModelSignatureManager$.html | 8 +- .../ml/tensorflow/sign/index.html | 8 +- ...inAlg$$implicits$$ExtendedDenseMatrix.html | 8 +- .../ml/util/LinAlg$$implicits$.html | 8 +- .../api/com/johnsnowlabs/ml/util/LinAlg$.html | 12 +- .../com/johnsnowlabs/ml/util/LlamaCPP$.html | 637 ++++ .../ml/util/LoadExternalModel$.html | 12 +- .../com/johnsnowlabs/ml/util/ModelArch$.html | 12 +- .../com/johnsnowlabs/ml/util/ModelEngine.html | 14 +- docs/api/com/johnsnowlabs/ml/util/ONNX$.html | 12 +- .../com/johnsnowlabs/ml/util/Openvino$.html | 12 +- .../com/johnsnowlabs/ml/util/PyTorch$.html | 12 +- .../com/johnsnowlabs/ml/util/TensorFlow$.html | 12 +- .../com/johnsnowlabs/ml/util/Unknown$.html | 12 +- docs/api/com/johnsnowlabs/ml/util/index.html | 28 +- .../johnsnowlabs/nlp/ActivationFunction$.html | 12 +- .../nlp/Annotation$$AnnotationContainer.html | 8 +- ...nnotation$$extractors$$AnnotationData.html | 8 +- .../nlp/Annotation$$extractors$.html | 8 +- .../api/com/johnsnowlabs/nlp/Annotation$.html | 12 +- docs/api/com/johnsnowlabs/nlp/Annotation.html | 12 +- .../AnnotationAudio$$AnnotationContainer.html | 8 +- .../nlp/AnnotationAudio$$AudioFields.html | 8 +- .../johnsnowlabs/nlp/AnnotationAudio$.html | 12 +- 
.../com/johnsnowlabs/nlp/AnnotationAudio.html | 12 +- .../AnnotationImage$$AnnotationContainer.html | 8 +- .../nlp/AnnotationImage$$ImageFields.html | 8 +- .../johnsnowlabs/nlp/AnnotationImage$.html | 12 +- .../com/johnsnowlabs/nlp/AnnotationImage.html | 12 +- .../johnsnowlabs/nlp/AnnotatorApproach.html | 12 +- .../com/johnsnowlabs/nlp/AnnotatorModel.html | 14 +- .../com/johnsnowlabs/nlp/AnnotatorType$.html | 12 +- .../com/johnsnowlabs/nlp/AudioAssembler$.html | 12 +- .../com/johnsnowlabs/nlp/AudioAssembler.html | 12 +- docs/api/com/johnsnowlabs/nlp/CanBeLazy.html | 14 +- docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html | 12 +- docs/api/com/johnsnowlabs/nlp/Doc2Chunk.html | 12 +- .../johnsnowlabs/nlp/DocumentAssembler$.html | 12 +- .../johnsnowlabs/nlp/DocumentAssembler.html | 12 +- .../johnsnowlabs/nlp/EmbeddingsFinisher$.html | 12 +- .../johnsnowlabs/nlp/EmbeddingsFinisher.html | 12 +- .../com/johnsnowlabs/nlp/FeaturesReader.html | 12 +- .../com/johnsnowlabs/nlp/FeaturesWriter.html | 12 +- docs/api/com/johnsnowlabs/nlp/Finisher$.html | 12 +- docs/api/com/johnsnowlabs/nlp/Finisher.html | 12 +- .../com/johnsnowlabs/nlp/GraphFinisher.html | 12 +- .../nlp/HasAudioFeatureProperties.html | 12 +- .../johnsnowlabs/nlp/HasBatchedAnnotate.html | 14 +- .../nlp/HasBatchedAnnotateAudio.html | 12 +- .../nlp/HasBatchedAnnotateImage.html | 12 +- .../nlp/HasCandidateLabelsProperties.html | 12 +- .../nlp/HasCaseSensitiveProperties.html | 14 +- .../HasClassifierActivationProperties.html | 12 +- .../nlp/HasEnableCachingProperties.html | 12 +- docs/api/com/johnsnowlabs/nlp/HasEngine.html | 14 +- .../api/com/johnsnowlabs/nlp/HasFeatures.html | 14 +- .../nlp/HasGeneratorProperties.html | 12 +- .../nlp/HasImageFeatureProperties.html | 12 +- .../nlp/HasInputAnnotationCols.html | 14 +- .../nlp/HasLlamaCppProperties.html | 12 +- .../nlp/HasMultipleInputAnnotationCols.html | 12 +- .../nlp/HasOutputAnnotationCol.html | 14 +- .../nlp/HasOutputAnnotatorType.html | 14 +- .../com/johnsnowlabs/nlp/HasPretrained.html | 14 +- .../HasProtectedParams$ProtectedParam.html | 8 +- .../johnsnowlabs/nlp/HasProtectedParams.html | 12 +- .../com/johnsnowlabs/nlp/HasRecursiveFit.html | 12 +- .../nlp/HasRecursiveTransform.html | 12 +- .../johnsnowlabs/nlp/HasSimpleAnnotate.html | 12 +- .../api/com/johnsnowlabs/nlp/IAnnotation.html | 12 +- .../com/johnsnowlabs/nlp/ImageAssembler$.html | 12 +- .../com/johnsnowlabs/nlp/ImageAssembler.html | 12 +- .../com/johnsnowlabs/nlp/JavaAnnotation.html | 12 +- .../com/johnsnowlabs/nlp/LightPipeline.html | 12 +- .../nlp/MultiDocumentAssembler$.html | 12 +- .../nlp/MultiDocumentAssembler.html | 12 +- .../nlp/ParamsAndFeaturesReadable.html | 14 +- .../nlp/ParamsAndFeaturesWritable.html | 14 +- .../johnsnowlabs/nlp/PromptAssembler$.html | 980 +++++ .../com/johnsnowlabs/nlp/PromptAssembler.html | 2068 +++++++++++ .../com/johnsnowlabs/nlp/RawAnnotator.html | 14 +- .../johnsnowlabs/nlp/RecursivePipeline.html | 12 +- .../nlp/RecursivePipelineModel.html | 12 +- docs/api/com/johnsnowlabs/nlp/SparkNLP$.html | 12 +- .../com/johnsnowlabs/nlp/TableAssembler$.html | 12 +- .../com/johnsnowlabs/nlp/TableAssembler.html | 12 +- .../com/johnsnowlabs/nlp/TokenAssembler$.html | 12 +- .../com/johnsnowlabs/nlp/TokenAssembler.html | 12 +- .../nlp/annotators/Chunk2Doc$.html | 8 +- .../nlp/annotators/Chunk2Doc.html | 8 +- .../nlp/annotators/ChunkTokenizer$.html | 8 +- .../nlp/annotators/ChunkTokenizer.html | 8 +- .../nlp/annotators/ChunkTokenizerModel$.html | 8 +- .../nlp/annotators/ChunkTokenizerModel.html | 8 +- 
.../johnsnowlabs/nlp/annotators/Chunker$.html | 8 +- .../johnsnowlabs/nlp/annotators/Chunker.html | 8 +- .../nlp/annotators/Date2Chunk$.html | 8 +- .../nlp/annotators/Date2Chunk.html | 8 +- .../nlp/annotators/DateMatcher$.html | 8 +- .../nlp/annotators/DateMatcher.html | 8 +- .../nlp/annotators/DateMatcherTranslator.html | 8 +- .../DateMatcherTranslatorPolicy.html | 8 +- .../nlp/annotators/DateMatcherUtils.html | 8 +- .../DocumentCharacterTextSplitter$.html | 8 +- .../DocumentCharacterTextSplitter.html | 8 +- .../nlp/annotators/DocumentNormalizer$.html | 8 +- .../nlp/annotators/DocumentNormalizer.html | 8 +- .../annotators/DocumentTokenSplitter$.html | 8 +- .../nlp/annotators/DocumentTokenSplitter.html | 8 +- .../nlp/annotators/EnglishStemmer$.html | 8 +- .../nlp/annotators/GraphExtraction.html | 8 +- .../nlp/annotators/Lemmatizer$.html | 8 +- .../nlp/annotators/Lemmatizer.html | 8 +- .../nlp/annotators/LemmatizerModel$.html | 8 +- .../nlp/annotators/LemmatizerModel.html | 8 +- .../nlp/annotators/LookAroundManager$.html | 8 +- .../nlp/annotators/MultiDateMatcher$.html | 8 +- .../nlp/annotators/MultiDateMatcher.html | 8 +- .../nlp/annotators/MultiDatePolicy$.html | 8 +- .../nlp/annotators/NGramGenerator$.html | 8 +- .../nlp/annotators/NGramGenerator.html | 8 +- .../nlp/annotators/Normalizer$.html | 8 +- .../nlp/annotators/Normalizer.html | 8 +- .../nlp/annotators/NormalizerModel$.html | 8 +- ...alizerModel$TokenizerAndNormalizerMap.html | 8 +- .../nlp/annotators/NormalizerModel.html | 8 +- .../annotators/PretrainedAnnotations$.html | 8 +- .../ReadablePretrainedLemmatizer.html | 8 +- ...adablePretrainedStopWordsCleanerModel.html | 8 +- .../ReadablePretrainedTextMatcher.html | 8 +- .../ReadablePretrainedTokenizer.html | 8 +- .../nlp/annotators/RecursiveTokenizer.html | 8 +- .../annotators/RecursiveTokenizerModel$.html | 8 +- .../annotators/RecursiveTokenizerModel.html | 8 +- .../nlp/annotators/RegexMatcher$.html | 8 +- .../nlp/annotators/RegexMatcher.html | 8 +- .../nlp/annotators/RegexMatcherModel$.html | 8 +- .../nlp/annotators/RegexMatcherModel.html | 8 +- .../nlp/annotators/RegexTokenizer$.html | 8 +- .../nlp/annotators/RegexTokenizer.html | 8 +- .../nlp/annotators/SingleDatePolicy$.html | 8 +- .../johnsnowlabs/nlp/annotators/Stemmer$.html | 8 +- .../johnsnowlabs/nlp/annotators/Stemmer.html | 8 +- .../nlp/annotators/StopWordsCleaner$.html | 8 +- .../nlp/annotators/StopWordsCleaner.html | 8 +- .../nlp/annotators/TextMatcher$.html | 8 +- .../nlp/annotators/TextMatcher.html | 8 +- .../nlp/annotators/TextMatcherModel$.html | 8 +- .../nlp/annotators/TextMatcherModel.html | 8 +- .../nlp/annotators/TextSplitter.html | 8 +- .../nlp/annotators/Token2Chunk$.html | 8 +- .../nlp/annotators/Token2Chunk.html | 8 +- .../nlp/annotators/Tokenizer$.html | 8 +- .../nlp/annotators/Tokenizer.html | 8 +- .../nlp/annotators/TokenizerModel$.html | 8 +- .../nlp/annotators/TokenizerModel.html | 8 +- .../nlp/annotators/audio/HubertForCTC$.html | 8 +- .../nlp/annotators/audio/HubertForCTC.html | 8 +- .../audio/ReadHubertForAudioDLModel.html | 8 +- .../audio/ReadWav2Vec2ForAudioDLModel.html | 8 +- .../audio/ReadWhisperForCTCDLModel.html | 8 +- ...ReadablePretrainedHubertForAudioModel.html | 8 +- ...adablePretrainedWav2Vec2ForAudioModel.html | 8 +- .../ReadablePretrainedWhisperForCTCModel.html | 8 +- .../nlp/annotators/audio/Wav2Vec2ForCTC$.html | 8 +- .../nlp/annotators/audio/Wav2Vec2ForCTC.html | 8 +- .../nlp/annotators/audio/WhisperForCTC$.html | 8 +- .../nlp/annotators/audio/WhisperForCTC.html | 8 +- 
.../audio/feature_extractor/AudioUtils$.html | 8 +- .../PreprocessorAttributes$.html | 8 +- .../WhisperPreprocessor.html | 8 +- .../audio/feature_extractor/index.html | 8 +- .../nlp/annotators/audio/index.html | 8 +- .../nlp/annotators/btm/BigTextMatcher$.html | 8 +- .../nlp/annotators/btm/BigTextMatcher.html | 8 +- .../annotators/btm/BigTextMatcherModel$.html | 8 +- .../annotators/btm/BigTextMatcherModel.html | 8 +- .../btm/ReadablePretrainedBigTextMatcher.html | 8 +- .../nlp/annotators/btm/TMEdgesReadWriter.html | 8 +- .../nlp/annotators/btm/TMEdgesReader.html | 8 +- .../nlp/annotators/btm/TMNodesReader.html | 8 +- .../nlp/annotators/btm/TMNodesWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReadWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReader.html | 8 +- .../nlp/annotators/btm/TrieNode.html | 8 +- .../nlp/annotators/btm/index.html | 8 +- .../dl/AlbertForQuestionAnswering$.html | 20 +- .../dl/AlbertForQuestionAnswering.html | 20 +- .../dl/AlbertForSequenceClassification$.html | 20 +- .../dl/AlbertForSequenceClassification.html | 20 +- .../dl/AlbertForTokenClassification$.html | 20 +- .../dl/AlbertForTokenClassification.html | 20 +- .../dl/AlbertForZeroShotClassification$.html | 20 +- .../dl/AlbertForZeroShotClassification.html | 20 +- .../dl/BartForZeroShotClassification$.html | 20 +- .../dl/BartForZeroShotClassification.html | 20 +- .../classifier/dl/BertForMultipleChoice$.html | 1484 ++++++++ .../classifier/dl/BertForMultipleChoice.html | 3212 +++++++++++++++++ .../dl/BertForQuestionAnswering$.html | 20 +- .../dl/BertForQuestionAnswering.html | 20 +- .../dl/BertForSequenceClassification$.html | 20 +- .../dl/BertForSequenceClassification.html | 20 +- .../dl/BertForTokenClassification$.html | 20 +- .../dl/BertForTokenClassification.html | 20 +- .../dl/BertForZeroShotClassification$.html | 20 +- .../dl/BertForZeroShotClassification.html | 20 +- .../dl/CamemBertForQuestionAnswering$.html | 20 +- .../dl/CamemBertForQuestionAnswering.html | 20 +- .../CamemBertForSequenceClassification$.html | 20 +- .../CamemBertForSequenceClassification.html | 20 +- .../dl/CamemBertForTokenClassification$.html | 20 +- .../dl/CamemBertForTokenClassification.html | 20 +- .../CamemBertForZeroShotClassification$.html | 20 +- .../CamemBertForZeroShotClassification.html | 20 +- .../classifier/dl/ClassifierDLApproach$.html | 20 +- .../classifier/dl/ClassifierDLApproach.html | 20 +- .../classifier/dl/ClassifierDLModel$.html | 20 +- .../classifier/dl/ClassifierDLModel.html | 20 +- .../classifier/dl/ClassifierEncoder.html | 20 +- .../classifier/dl/ClassifierMetrics.html | 20 +- .../dl/DeBertaForQuestionAnswering$.html | 20 +- .../dl/DeBertaForQuestionAnswering.html | 20 +- .../dl/DeBertaForSequenceClassification$.html | 20 +- .../dl/DeBertaForSequenceClassification.html | 20 +- .../dl/DeBertaForTokenClassification$.html | 20 +- .../dl/DeBertaForTokenClassification.html | 20 +- .../dl/DeBertaForZeroShotClassification$.html | 20 +- .../dl/DeBertaForZeroShotClassification.html | 20 +- .../dl/DistilBertForQuestionAnswering$.html | 20 +- .../dl/DistilBertForQuestionAnswering.html | 20 +- .../DistilBertForSequenceClassification$.html | 20 +- .../DistilBertForSequenceClassification.html | 20 +- .../dl/DistilBertForTokenClassification$.html | 20 +- .../dl/DistilBertForTokenClassification.html | 20 +- .../DistilBertForZeroShotClassification$.html | 20 +- .../DistilBertForZeroShotClassification.html | 20 +- .../dl/LongformerForQuestionAnswering$.html | 20 +- .../dl/LongformerForQuestionAnswering.html | 20 +- 
.../LongformerForSequenceClassification$.html | 20 +- .../LongformerForSequenceClassification.html | 20 +- .../dl/LongformerForTokenClassification$.html | 20 +- .../dl/LongformerForTokenClassification.html | 20 +- .../dl/MPNetForQuestionAnswering$.html | 20 +- .../dl/MPNetForQuestionAnswering.html | 20 +- .../dl/MPNetForSequenceClassification$.html | 20 +- .../dl/MPNetForSequenceClassification.html | 20 +- .../dl/MPNetForTokenClassification$.html | 20 +- .../dl/MPNetForTokenClassification.html | 20 +- .../dl/MultiClassifierDLApproach.html | 20 +- .../dl/MultiClassifierDLModel$.html | 20 +- .../classifier/dl/MultiClassifierDLModel.html | 20 +- ...ReadAlbertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadAlbertForSequenceDLModel.html | 20 +- .../dl/ReadAlbertForTokenDLModel.html | 20 +- .../dl/ReadAlbertForZeroShotDLModel.html | 20 +- .../dl/ReadBartForZeroShotDLModel.html | 20 +- .../dl/ReadBertForMultipleChoiceModel.html | 1303 +++++++ .../ReadBertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadBertForSequenceDLModel.html | 20 +- .../dl/ReadBertForTokenDLModel.html | 20 +- .../dl/ReadBertForZeroShotDLModel.html | 20 +- .../dl/ReadCamemBertForQADLModel.html | 20 +- .../dl/ReadCamemBertForSequenceDLModel.html | 20 +- .../dl/ReadCamemBertForTokenDLModel.html | 20 +- ...eadCamemBertForZeroShotClassification.html | 20 +- .../dl/ReadClassifierDLTensorflowModel.html | 20 +- ...eadDeBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadDeBertaForSequenceDLModel.html | 20 +- .../dl/ReadDeBertaForTokenDLModel.html | 20 +- .../dl/ReadDeBertaForZeroShotDLModel.html | 20 +- ...DistilBertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadDistilBertForSequenceDLModel.html | 20 +- .../dl/ReadDistilBertForTokenDLModel.html | 20 +- .../dl/ReadDistilBertForZeroShotDLModel.html | 20 +- ...LongformerForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadLongformerForSequenceDLModel.html | 20 +- .../dl/ReadLongformerForTokenDLModel.html | 20 +- .../ReadMPNetForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadMPNetForSequenceDLModel.html | 20 +- .../dl/ReadMPNetForTokenDLModel.html | 20 +- .../ReadMultiClassifierDLTensorflowModel.html | 20 +- ...nedCamemBertForZeroShotClassification.html | 20 +- ...eadRoBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadRoBertaForSequenceDLModel.html | 20 +- .../dl/ReadRoBertaForTokenDLModel.html | 20 +- .../dl/ReadRoBertaForZeroShotDLModel.html | 20 +- .../dl/ReadSentimentDLTensorflowModel.html | 20 +- .../ReadTapasForQuestionAnsweringDLModel.html | 20 +- ...XlmRoBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadXlmRoBertaForSequenceDLModel.html | 20 +- .../dl/ReadXlmRoBertaForTokenDLModel.html | 20 +- .../dl/ReadXlmRoBertaForZeroShotDLModel.html | 20 +- .../dl/ReadXlnetForSequenceDLModel.html | 20 +- .../dl/ReadXlnetForTokenDLModel.html | 20 +- .../ReadablePretrainedAlbertForQAModel.html | 20 +- ...dablePretrainedAlbertForSequenceModel.html | 20 +- ...ReadablePretrainedAlbertForTokenModel.html | 20 +- ...dablePretrainedAlbertForZeroShotModel.html | 20 +- ...eadablePretrainedBartForZeroShotModel.html | 20 +- ...ePretrainedBertForMultipleChoiceModel.html | 1345 +++++++ .../dl/ReadablePretrainedBertForQAModel.html | 20 +- ...eadablePretrainedBertForSequenceModel.html | 20 +- .../ReadablePretrainedBertForTokenModel.html | 20 +- ...eadablePretrainedBertForZeroShotModel.html | 20 +- ...ReadablePretrainedCamemBertForQAModel.html | 20 +- ...lePretrainedCamemBertForSequenceModel.html | 20 +- ...dablePretrainedCamemBertForTokenModel.html | 20 +- 
.../dl/ReadablePretrainedClassifierDL.html | 20 +- .../ReadablePretrainedDeBertaForQAModel.html | 20 +- ...ablePretrainedDeBertaForSequenceModel.html | 20 +- ...eadablePretrainedDeBertaForTokenModel.html | 20 +- ...ablePretrainedDeBertaForZeroShotModel.html | 20 +- ...eadablePretrainedDistilBertForQAModel.html | 20 +- ...ePretrainedDistilBertForSequenceModel.html | 20 +- ...ablePretrainedDistilBertForTokenModel.html | 20 +- ...ePretrainedDistilBertForZeroShotModel.html | 20 +- ...eadablePretrainedLongformerForQAModel.html | 20 +- ...ePretrainedLongformerForSequenceModel.html | 20 +- ...ablePretrainedLongformerForTokenModel.html | 20 +- .../dl/ReadablePretrainedMPNetForQAModel.html | 20 +- ...adablePretrainedMPNetForSequenceModel.html | 20 +- ...eadablePretrainedMPNetForTokenDLModel.html | 20 +- .../ReadablePretrainedMultiClassifierDL.html | 20 +- .../ReadablePretrainedRoBertaForQAModel.html | 20 +- ...ablePretrainedRoBertaForSequenceModel.html | 20 +- ...eadablePretrainedRoBertaForTokenModel.html | 20 +- ...ablePretrainedRoBertaForZeroShotModel.html | 20 +- .../dl/ReadablePretrainedSentimentDL.html | 20 +- .../dl/ReadablePretrainedTapasForQAModel.html | 20 +- ...eadablePretrainedXlmRoBertaForQAModel.html | 20 +- ...ePretrainedXlmRoBertaForSequenceModel.html | 20 +- ...ablePretrainedXlmRoBertaForTokenModel.html | 20 +- ...ePretrainedXlmRoBertaForZeroShotModel.html | 20 +- ...adablePretrainedXlnetForSequenceModel.html | 20 +- .../ReadablePretrainedXlnetForTokenModel.html | 20 +- .../dl/RoBertaForQuestionAnswering$.html | 20 +- .../dl/RoBertaForQuestionAnswering.html | 20 +- .../dl/RoBertaForSequenceClassification$.html | 20 +- .../dl/RoBertaForSequenceClassification.html | 20 +- .../dl/RoBertaForTokenClassification$.html | 20 +- .../dl/RoBertaForTokenClassification.html | 20 +- .../dl/RoBertaForZeroShotClassification$.html | 20 +- .../dl/RoBertaForZeroShotClassification.html | 20 +- .../classifier/dl/SentimentApproach$.html | 20 +- .../classifier/dl/SentimentDLApproach.html | 20 +- .../classifier/dl/SentimentDLModel$.html | 20 +- .../classifier/dl/SentimentDLModel.html | 20 +- .../dl/TapasForQuestionAnswering$.html | 20 +- .../dl/TapasForQuestionAnswering.html | 20 +- .../dl/XlmRoBertaForQuestionAnswering$.html | 20 +- .../dl/XlmRoBertaForQuestionAnswering.html | 20 +- .../XlmRoBertaForSequenceClassification$.html | 20 +- .../XlmRoBertaForSequenceClassification.html | 20 +- .../dl/XlmRoBertaForTokenClassification$.html | 20 +- .../dl/XlmRoBertaForTokenClassification.html | 20 +- .../XlmRoBertaForZeroShotClassification$.html | 20 +- .../XlmRoBertaForZeroShotClassification.html | 20 +- .../dl/XlnetForSequenceClassification$.html | 20 +- .../dl/XlnetForSequenceClassification.html | 20 +- .../dl/XlnetForTokenClassification$.html | 20 +- .../dl/XlnetForTokenClassification.html | 20 +- .../nlp/annotators/classifier/dl/index.html | 125 +- .../nlp/annotators/classifier/index.html | 8 +- .../nlp/annotators/common/Annotated$.html | 8 +- .../nlp/annotators/common/Annotated.html | 8 +- .../nlp/annotators/common/ChunkSplit$.html | 8 +- .../nlp/annotators/common/ConllSentence.html | 8 +- .../DatasetHelpers$$DataFrameHelper.html | 8 +- .../annotators/common/DatasetHelpers$.html | 8 +- .../annotators/common/DependencyParsed$.html | 8 +- .../common/DependencyParsedSentence.html | 8 +- .../common/EmbeddingsWithSentence$.html | 8 +- .../annotators/common/IndexedTaggedWord.html | 8 +- .../nlp/annotators/common/IndexedToken.html | 8 +- .../nlp/annotators/common/InfixToken$.html | 8 +- 
.../nlp/annotators/common/InfixToken.html | 8 +- .../LabeledDependency$$DependencyInfo.html | 8 +- .../annotators/common/LabeledDependency$.html | 8 +- .../nlp/annotators/common/NerTagged$.html | 8 +- .../nlp/annotators/common/PosTagged$.html | 8 +- .../nlp/annotators/common/PrefixedToken$.html | 8 +- .../nlp/annotators/common/PrefixedToken.html | 8 +- .../common/PreprocessingParser.html | 8 +- .../nlp/annotators/common/Sentence$.html | 8 +- .../nlp/annotators/common/Sentence.html | 8 +- .../nlp/annotators/common/SentenceSplit$.html | 8 +- .../nlp/annotators/common/SuffixedToken$.html | 8 +- .../nlp/annotators/common/SuffixedToken.html | 8 +- .../nlp/annotators/common/TableData$.html | 8 +- .../nlp/annotators/common/TableData.html | 8 +- .../nlp/annotators/common/Tagged.html | 8 +- .../annotators/common/TaggedSentence$.html | 8 +- .../nlp/annotators/common/TaggedSentence.html | 8 +- .../nlp/annotators/common/TaggedWord.html | 8 +- .../nlp/annotators/common/TokenPiece.html | 8 +- .../common/TokenPieceEmbeddings$.html | 8 +- .../common/TokenPieceEmbeddings.html | 8 +- .../annotators/common/TokenizedSentence.html | 8 +- .../common/TokenizedWithSentence$.html | 8 +- .../annotators/common/WordWithDependency.html | 8 +- .../common/WordpieceEmbeddingsSentence$.html | 8 +- .../common/WordpieceEmbeddingsSentence.html | 8 +- .../common/WordpieceTokenized$.html | 8 +- .../common/WordpieceTokenizedSentence.html | 8 +- .../nlp/annotators/common/index.html | 8 +- .../ReadSpanBertCorefTensorflowModel.html | 8 +- .../ReadablePretrainedSpanBertCorefModel.html | 8 +- .../annotators/coref/SpanBertCorefModel$.html | 8 +- .../annotators/coref/SpanBertCorefModel.html | 8 +- .../nlp/annotators/coref/index.html | 8 +- .../cv/CLIPForZeroShotClassification$.html | 8 +- .../cv/CLIPForZeroShotClassification.html | 8 +- .../cv/ConvNextForImageClassification$.html | 8 +- .../cv/ConvNextForImageClassification.html | 8 +- .../nlp/annotators/cv/HasRescaleFactor.html | 8 +- ...eadCLIPForZeroShotClassificationModel.html | 8 +- .../cv/ReadConvNextForImageDLModel.html | 8 +- .../cv/ReadSwinForImageDLModel.html | 8 +- .../annotators/cv/ReadViTForImageDLModel.html | 8 +- .../cv/ReadVisionEncoderDecoderDLModel.html | 8 +- ...nedCLIPForZeroShotClassificationModel.html | 8 +- ...adablePretrainedConvNextForImageModel.html | 8 +- .../ReadablePretrainedSwinForImageModel.html | 8 +- .../ReadablePretrainedViTForImageModel.html | 8 +- ...lePretrainedVisionEncoderDecoderModel.html | 8 +- .../cv/SwinForImageClassification$.html | 8 +- .../cv/SwinForImageClassification.html | 8 +- .../cv/ViTForImageClassification$.html | 8 +- .../cv/ViTForImageClassification.html | 8 +- ...sionEncoderDecoderForImageCaptioning$.html | 8 +- ...isionEncoderDecoderForImageCaptioning.html | 8 +- .../johnsnowlabs/nlp/annotators/cv/index.html | 8 +- .../er/AhoCorasickAutomaton$Node.html | 8 +- .../annotators/er/AhoCorasickAutomaton.html | 8 +- .../nlp/annotators/er/EntityPattern.html | 8 +- .../annotators/er/EntityRulerApproach.html | 8 +- .../annotators/er/EntityRulerFeatures.html | 8 +- .../nlp/annotators/er/EntityRulerModel$.html | 8 +- .../nlp/annotators/er/EntityRulerModel.html | 8 +- .../nlp/annotators/er/EntityRulerUtil$.html | 8 +- .../annotators/er/FlattenEntityPattern.html | 8 +- .../nlp/annotators/er/PatternsReadWriter.html | 8 +- .../nlp/annotators/er/PatternsReader.html | 8 +- .../er/ReadablePretrainedEntityRuler.html | 8 +- .../er/RegexPatternsReadWriter.html | 8 +- .../annotators/er/RegexPatternsReader.html | 8 +- 
.../johnsnowlabs/nlp/annotators/er/index.html | 8 +- .../johnsnowlabs/nlp/annotators/index.html | 8 +- .../nlp/annotators/keyword/index.html | 8 +- .../keyword/yake/YakeKeywordExtraction$.html | 8 +- .../keyword/yake/YakeKeywordExtraction.html | 8 +- .../annotators/keyword/yake/YakeParams.html | 8 +- .../nlp/annotators/keyword/yake/index.html | 8 +- .../annotators/keyword/yake/util/Token.html | 8 +- .../keyword/yake/util/Utilities$.html | 8 +- .../annotators/keyword/yake/util/index.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL$.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL.html | 8 +- ...ReadLanguageDetectorDLTensorflowModel.html | 8 +- ...ablePretrainedLanguageDetectorDLModel.html | 8 +- .../nlp/annotators/ld/dl/index.html | 8 +- .../johnsnowlabs/nlp/annotators/ld/index.html | 8 +- .../nlp/annotators/ner/ModelMetrics$.html | 8 +- .../nlp/annotators/ner/NamedEntity.html | 8 +- .../nlp/annotators/ner/NerApproach.html | 8 +- .../nlp/annotators/ner/NerConverter$.html | 8 +- .../nlp/annotators/ner/NerConverter.html | 8 +- .../nlp/annotators/ner/NerOverwriter$.html | 8 +- .../nlp/annotators/ner/NerOverwriter.html | 8 +- .../nlp/annotators/ner/NerTagsEncoding$.html | 8 +- .../nlp/annotators/ner/Verbose$.html | 8 +- .../ner/crf/DictionaryFeatures$.html | 8 +- .../ner/crf/DictionaryFeatures.html | 8 +- .../ner/crf/FeatureGenerator$TokenType$.html | 8 +- .../annotators/ner/crf/FeatureGenerator.html | 8 +- .../annotators/ner/crf/NerCrfApproach$.html | 8 +- .../annotators/ner/crf/NerCrfApproach.html | 8 +- .../nlp/annotators/ner/crf/NerCrfModel$.html | 8 +- .../nlp/annotators/ner/crf/NerCrfModel.html | 8 +- .../ner/crf/ReadablePretrainedNerCrf.html | 8 +- .../nlp/annotators/ner/crf/index.html | 8 +- .../nlp/annotators/ner/dl/LoadsContrib$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach.html | 8 +- .../nlp/annotators/ner/dl/NerDLModel$.html | 8 +- .../nlp/annotators/ner/dl/NerDLModel.html | 8 +- .../ner/dl/NerDLModelPythonReader$.html | 8 +- .../ner/dl/ReadZeroShotNerDLModel.html | 8 +- .../ner/dl/ReadablePretrainedNerDL.html | 8 +- .../ner/dl/ReadablePretrainedZeroShotNer.html | 8 +- .../nlp/annotators/ner/dl/ReadsNERGraph.html | 8 +- .../annotators/ner/dl/WithGraphResolver.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel$.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel.html | 8 +- .../nlp/annotators/ner/dl/index.html | 8 +- .../nlp/annotators/ner/index.html | 8 +- ...lizableFormat$$SerializableDateFormat.html | 8 +- .../AnnotatorParam$SerializableFormat$.html | 8 +- .../nlp/annotators/param/AnnotatorParam.html | 8 +- .../annotators/param/EvaluationDLParams.html | 8 +- .../param/ExternalResourceParam.html | 8 +- .../param/SerializedAnnotatorComponent.html | 8 +- .../param/WritableAnnotatorComponent.html | 8 +- .../nlp/annotators/param/index.html | 8 +- .../parser/dep/DependencyParserApproach$.html | 8 +- .../parser/dep/DependencyParserApproach.html | 8 +- .../parser/dep/DependencyParserModel$.html | 8 +- .../parser/dep/DependencyParserModel.html | 8 +- .../GreedyTransition/DependencyMaker$.html | 8 +- .../DependencyMaker$CurrentState.html | 8 +- .../DependencyMaker$ParseState.html | 8 +- .../dep/GreedyTransition/DependencyMaker.html | 8 +- .../GreedyTransitionApproach$.html | 8 +- .../parser/dep/GreedyTransition/index.html | 8 +- .../GreedyTransition/package$$Feature.html | 8 +- .../GreedyTransition/package$$WordData.html | 8 +- .../parser/dep/Perceptron$WeightLearner.html | 8 +- .../nlp/annotators/parser/dep/Perceptron.html | 
8 +- .../dep/ReadablePretrainedDependency.html | 8 +- .../annotators/parser/dep/TagDictionary$.html | 8 +- .../nlp/annotators/parser/dep/Tagger$.html | 8 +- .../nlp/annotators/parser/dep/Tagger.html | 8 +- .../nlp/annotators/parser/dep/index.html | 8 +- .../nlp/annotators/parser/index.html | 8 +- .../annotators/parser/typdep/ConllData.html | 8 +- .../parser/typdep/DependencyArcList.html | 8 +- .../parser/typdep/DependencyInstance.html | 8 +- .../parser/typdep/DependencyPipe.html | 8 +- .../parser/typdep/LocalFeatureData.html | 8 +- .../parser/typdep/LowRankTensor.html | 8 +- .../nlp/annotators/parser/typdep/Options.html | 8 +- .../annotators/parser/typdep/Parameters.html | 8 +- .../parser/typdep/PredictionParameters.html | 8 +- .../ReadablePretrainedTypedDependency.html | 8 +- .../parser/typdep/TrainDependencies.html | 8 +- .../annotators/parser/typdep/TrainFile.html | 8 +- .../parser/typdep/TypedDependencyParser.html | 8 +- .../TypedDependencyParserApproach$.html | 8 +- .../typdep/TypedDependencyParserApproach.html | 8 +- .../typdep/TypedDependencyParserModel$.html | 8 +- .../typdep/TypedDependencyParserModel.html | 8 +- .../typdep/feature/FeatureTemplate.html | 8 +- .../feature/SyntacticFeatureFactory.html | 8 +- .../parser/typdep/feature/index.html | 8 +- .../nlp/annotators/parser/typdep/index.html | 8 +- .../parser/typdep/io/Conll09Reader.html | 8 +- .../parser/typdep/io/ConllUReader.html | 8 +- .../parser/typdep/io/ConllWriter.html | 8 +- .../parser/typdep/io/DependencyReader.html | 8 +- .../annotators/parser/typdep/io/index.html | 8 +- .../parser/typdep/util/Alphabet.html | 8 +- .../parser/typdep/util/Collector.html | 8 +- .../parser/typdep/util/DependencyLabel.html | 8 +- .../parser/typdep/util/Dictionary.html | 8 +- .../parser/typdep/util/DictionarySet.html | 8 +- .../parser/typdep/util/FeatureVector.html | 8 +- .../parser/typdep/util/ScoreCollector.html | 8 +- .../annotators/parser/typdep/util/Utils.html | 8 +- .../annotators/parser/typdep/util/index.html | 8 +- .../nlp/annotators/pos/index.html | 8 +- .../pos/perceptron/AveragedPerceptron.html | 8 +- .../pos/perceptron/PerceptronApproach$.html | 8 +- .../pos/perceptron/PerceptronApproach.html | 8 +- .../PerceptronApproachDistributed$.html | 8 +- .../PerceptronApproachDistributed.html | 8 +- .../pos/perceptron/PerceptronModel$.html | 8 +- .../pos/perceptron/PerceptronModel.html | 8 +- .../perceptron/PerceptronPredictionUtils.html | 8 +- .../perceptron/PerceptronTrainingUtils.html | 8 +- .../pos/perceptron/PerceptronUtils.html | 8 +- .../ReadablePretrainedPerceptron.html | 8 +- .../StringMapStringDoubleAccumulator.html | 8 +- .../perceptron/TrainingPerceptronLegacy.html | 8 +- .../TupleKeyLongDoubleMapAccumulator.html | 8 +- .../nlp/annotators/pos/perceptron/index.html | 8 +- .../sbd/SentenceDetectorParams.html | 8 +- .../nlp/annotators/sbd/index.html | 8 +- .../sbd/pragmatic/CustomPragmaticMethod.html | 8 +- .../sbd/pragmatic/DefaultPragmaticMethod.html | 8 +- .../sbd/pragmatic/MixedPragmaticMethod.html | 8 +- .../pragmatic/PragmaticContentFormatter$.html | 8 +- .../pragmatic/PragmaticContentFormatter.html | 8 +- .../sbd/pragmatic/PragmaticDictionaries$.html | 8 +- .../sbd/pragmatic/PragmaticMethod.html | 8 +- .../pragmatic/PragmaticSentenceExtractor.html | 8 +- .../sbd/pragmatic/PragmaticSymbols$.html | 8 +- .../annotators/sbd/pragmatic/RuleSymbols.html | 8 +- .../sbd/pragmatic/SentenceDetector$.html | 8 +- .../sbd/pragmatic/SentenceDetector.html | 8 +- .../nlp/annotators/sbd/pragmatic/index.html | 8 +- 
.../nlp/annotators/sda/index.html | 8 +- .../sda/pragmatic/PragmaticScorer.html | 8 +- .../sda/pragmatic/SentimentDetector$.html | 8 +- .../sda/pragmatic/SentimentDetector.html | 8 +- .../pragmatic/SentimentDetectorModel$.html | 8 +- .../sda/pragmatic/SentimentDetectorModel.html | 8 +- .../nlp/annotators/sda/pragmatic/index.html | 8 +- .../sda/vivekn/ReadablePretrainedVivekn.html | 8 +- .../sda/vivekn/ViveknSentimentApproach.html | 8 +- .../sda/vivekn/ViveknSentimentModel$.html | 8 +- .../sda/vivekn/ViveknSentimentModel.html | 8 +- .../sda/vivekn/ViveknSentimentUtils.html | 8 +- .../nlp/annotators/sda/vivekn/index.html | 8 +- .../sentence_detector_dl/Metrics.html | 8 +- .../ReadablePretrainedSentenceDetectorDL.html | 8 +- .../ReadsSentenceDetectorDLGraph.html | 8 +- .../SentenceDetectorDLApproach.html | 8 +- .../SentenceDetectorDLEncoder$.html | 8 +- .../SentenceDetectorDLEncoder.html | 8 +- .../SentenceDetectorDLEncoderParam.html | 8 +- .../SentenceDetectorDLModel$.html | 8 +- .../SentenceDetectorDLModel.html | 8 +- .../sentence_detector_dl/index.html | 8 +- .../annotators/seq2seq/AutoGGUFModel$.html | 8 +- .../nlp/annotators/seq2seq/AutoGGUFModel.html | 8 +- .../annotators/seq2seq/BartTransformer$.html | 8 +- .../annotators/seq2seq/BartTransformer.html | 8 +- .../annotators/seq2seq/CPMTransformer$.html | 8 +- .../annotators/seq2seq/CPMTransformer.html | 8 +- .../annotators/seq2seq/GPT2Transformer$.html | 8 +- .../annotators/seq2seq/GPT2Transformer.html | 8 +- .../seq2seq/LLAMA2Transformer$.html | 8 +- .../annotators/seq2seq/LLAMA2Transformer.html | 8 +- .../seq2seq/LLAMA3Transformer$.html | 8 +- .../annotators/seq2seq/LLAMA3Transformer.html | 8 +- .../seq2seq/M2M100Transformer$.html | 8 +- .../annotators/seq2seq/M2M100Transformer.html | 8 +- .../seq2seq/MarianTransformer$.html | 8 +- .../annotators/seq2seq/MarianTransformer.html | 8 +- .../seq2seq/MistralTransformer$.html | 8 +- .../seq2seq/MistralTransformer.html | 8 +- .../annotators/seq2seq/NLLBTransformer$.html | 8 +- .../annotators/seq2seq/NLLBTransformer.html | 8 +- .../annotators/seq2seq/Phi2Transformer$.html | 8 +- .../annotators/seq2seq/Phi2Transformer.html | 8 +- .../annotators/seq2seq/Phi3Transformer$.html | 8 +- .../annotators/seq2seq/Phi3Transformer.html | 8 +- .../annotators/seq2seq/QwenTransformer$.html | 8 +- .../annotators/seq2seq/QwenTransformer.html | 8 +- .../annotators/seq2seq/ReadAutoGGUFModel.html | 8 +- .../seq2seq/ReadBartTransformerDLModel.html | 8 +- .../seq2seq/ReadCPMTransformerDLModel.html | 8 +- .../seq2seq/ReadGPT2TransformerDLModel.html | 8 +- .../seq2seq/ReadLLAMA2TransformerDLModel.html | 8 +- .../seq2seq/ReadLLAMA3TransformerDLModel.html | 8 +- .../seq2seq/ReadM2M100TransformerDLModel.html | 8 +- .../seq2seq/ReadMarianMTDLModel.html | 8 +- .../ReadMistralTransformerDLModel.html | 8 +- .../seq2seq/ReadNLLBTransformerDLModel.html | 8 +- .../seq2seq/ReadPhi2TransformerDLModel.html | 8 +- .../seq2seq/ReadPhi3TransformerDLModel.html | 8 +- .../seq2seq/ReadQwenTransformerDLModel.html | 8 +- .../ReadStarCoderTransformerDLModel.html | 8 +- .../seq2seq/ReadT5TransformerDLModel.html | 8 +- .../ReadablePretrainedAutoGGUFModel.html | 8 +- ...eadablePretrainedBartTransformerModel.html | 8 +- ...ReadablePretrainedCPMTransformerModel.html | 8 +- ...eadablePretrainedGPT2TransformerModel.html | 8 +- ...dablePretrainedLLAMA2TransformerModel.html | 8 +- ...dablePretrainedLLAMA3TransformerModel.html | 8 +- ...dablePretrainedM2M100TransformerModel.html | 8 +- .../ReadablePretrainedMarianMTModel.html | 8 +- 
...ablePretrainedMistralTransformerModel.html | 8 +- ...eadablePretrainedNLLBTransformerModel.html | 8 +- ...eadablePretrainedPhi2TransformerModel.html | 8 +- ...eadablePretrainedPhi3TransformerModel.html | 8 +- ...eadablePretrainedQwenTransformerModel.html | 8 +- ...lePretrainedStarCoderTransformerModel.html | 8 +- .../ReadablePretrainedT5TransformerModel.html | 8 +- .../seq2seq/StarCoderTransformer$.html | 8 +- .../seq2seq/StarCoderTransformer.html | 8 +- .../annotators/seq2seq/T5Transformer$.html | 8 +- .../nlp/annotators/seq2seq/T5Transformer.html | 8 +- .../nlp/annotators/seq2seq/index.html | 8 +- .../DocumentSimilarityRankerApproach$.html | 8 +- .../DocumentSimilarityRankerApproach.html | 8 +- .../DocumentSimilarityRankerModel$.html | 8 +- .../DocumentSimilarityRankerModel.html | 8 +- .../similarity/DocumentSimilarityUtil$.html | 8 +- .../similarity/IndexedNeighbors.html | 8 +- .../IndexedNeighborsWithDistance.html | 8 +- .../similarity/NeighborAnnotation.html | 8 +- .../similarity/NeighborsResultSet.html | 8 +- .../ReadableDocumentSimilarityRanker.html | 8 +- .../nlp/annotators/similarity/index.html | 8 +- .../spell/context/CandidateStrategy$.html | 8 +- ...ntextSpellCheckerApproach$ArrayHelper.html | 8 +- .../context/ContextSpellCheckerApproach.html | 8 +- .../context/ContextSpellCheckerModel$.html | 8 +- .../ContextSpellCheckerModel$StringTools.html | 8 +- .../context/ContextSpellCheckerModel.html | 8 +- .../spell/context/HasTransducerFeatures.html | 8 +- .../spell/context/LangModelSentence.html | 8 +- .../ReadablePretrainedContextSpell.html | 8 +- .../context/ReadsLanguageModelGraph.html | 8 +- .../spell/context/WeightedLevenshtein.html | 8 +- .../nlp/annotators/spell/context/index.html | 8 +- .../spell/context/parser/AgeToken.html | 8 +- .../spell/context/parser/DateToken.html | 8 +- .../context/parser/GenericRegexParser.html | 8 +- .../context/parser/GenericVocabParser.html | 8 +- .../spell/context/parser/LocationClass.html | 8 +- .../spell/context/parser/MainVocab.html | 8 +- .../spell/context/parser/MedicationClass.html | 8 +- .../spell/context/parser/NamesClass.html | 8 +- .../spell/context/parser/NumberToken.html | 8 +- .../spell/context/parser/RegexParser.html | 8 +- .../context/parser/SerializableClass.html | 8 +- .../context/parser/SpecialClassParser.html | 8 +- .../context/parser/TransducerSeqFeature.html | 8 +- .../spell/context/parser/UnitToken.html | 8 +- .../spell/context/parser/VocabParser.html | 8 +- .../spell/context/parser/index.html | 8 +- .../nlp/annotators/spell/index.html | 8 +- .../spell/norvig/NorvigSweetingApproach$.html | 8 +- .../spell/norvig/NorvigSweetingApproach.html | 8 +- .../spell/norvig/NorvigSweetingModel$.html | 8 +- .../spell/norvig/NorvigSweetingModel.html | 8 +- .../spell/norvig/NorvigSweetingParams.html | 8 +- .../norvig/ReadablePretrainedNorvig.html | 8 +- .../nlp/annotators/spell/norvig/index.html | 8 +- .../ReadablePretrainedSymmetric.html | 8 +- .../symmetric/SymmetricDeleteApproach$.html | 8 +- .../symmetric/SymmetricDeleteApproach.html | 8 +- .../symmetric/SymmetricDeleteModel$.html | 8 +- .../SymmetricDeleteModel$SuggestedWord.html | 8 +- .../spell/symmetric/SymmetricDeleteModel.html | 8 +- .../symmetric/SymmetricDeleteParams.html | 8 +- .../nlp/annotators/spell/symmetric/index.html | 8 +- .../nlp/annotators/spell/util/Utilities$.html | 8 +- .../nlp/annotators/spell/util/index.html | 8 +- .../nlp/annotators/tapas/TapasCellDate$.html | 8 +- .../nlp/annotators/tapas/TapasCellDate.html | 8 +- 
.../nlp/annotators/tapas/TapasCellValue$.html | 8 +- .../nlp/annotators/tapas/TapasCellValue.html | 8 +- .../nlp/annotators/tapas/TapasEncoder.html | 8 +- .../nlp/annotators/tapas/TapasInputData.html | 8 +- .../tapas/TapasNumericRelation$.html | 8 +- .../tapas/TapasNumericValueSpan$.html | 8 +- .../tapas/TapasNumericValueSpan.html | 8 +- .../nlp/annotators/tapas/index.html | 8 +- .../tokenizer/bpe/BartTokenizer.html | 8 +- .../tokenizer/bpe/BpeTokenizer$.html | 8 +- .../tokenizer/bpe/CLIPTokenizer.html | 8 +- .../tokenizer/bpe/Gpt2Tokenizer.html | 8 +- .../tokenizer/bpe/LLAMA3Tokenizer.html | 8 +- .../tokenizer/bpe/Phi2Tokenizer.html | 8 +- .../tokenizer/bpe/QwenTokenizer.html | 8 +- .../tokenizer/bpe/RobertaTokenizer.html | 8 +- .../tokenizer/bpe/SpecialToken.html | 8 +- .../tokenizer/bpe/StarCoderTokenizer.html | 8 +- .../tokenizer/bpe/WhisperTokenDecoder.html | 8 +- .../nlp/annotators/tokenizer/bpe/index.html | 8 +- .../nlp/annotators/tokenizer/index.html | 8 +- .../ws/ReadablePretrainedWordSegmenter.html | 8 +- .../nlp/annotators/ws/TagsType$.html | 8 +- .../annotators/ws/WordSegmenterApproach$.html | 8 +- .../annotators/ws/WordSegmenterApproach.html | 8 +- .../annotators/ws/WordSegmenterModel$.html | 8 +- .../nlp/annotators/ws/WordSegmenterModel.html | 8 +- .../johnsnowlabs/nlp/annotators/ws/index.html | 8 +- .../nlp/embeddings/AlbertEmbeddings$.html | 8 +- .../nlp/embeddings/AlbertEmbeddings.html | 8 +- .../nlp/embeddings/BGEEmbeddings$.html | 8 +- .../nlp/embeddings/BGEEmbeddings.html | 8 +- .../nlp/embeddings/BertEmbeddings$.html | 8 +- .../nlp/embeddings/BertEmbeddings.html | 8 +- .../embeddings/BertSentenceEmbeddings$.html | 8 +- .../embeddings/BertSentenceEmbeddings.html | 8 +- .../nlp/embeddings/CamemBertEmbeddings$.html | 8 +- .../nlp/embeddings/CamemBertEmbeddings.html | 8 +- .../nlp/embeddings/ChunkEmbeddings$.html | 8 +- .../nlp/embeddings/ChunkEmbeddings.html | 8 +- .../nlp/embeddings/DeBertaEmbeddings$.html | 8 +- .../nlp/embeddings/DeBertaEmbeddings.html | 8 +- .../nlp/embeddings/DistilBertEmbeddings$.html | 8 +- .../nlp/embeddings/DistilBertEmbeddings.html | 8 +- .../nlp/embeddings/Doc2VecApproach$.html | 8 +- .../nlp/embeddings/Doc2VecApproach.html | 8 +- .../nlp/embeddings/Doc2VecModel$.html | 8 +- .../nlp/embeddings/Doc2VecModel.html | 8 +- .../nlp/embeddings/E5Embeddings$.html | 8 +- .../nlp/embeddings/E5Embeddings.html | 8 +- .../nlp/embeddings/ElmoEmbeddings$.html | 8 +- .../nlp/embeddings/ElmoEmbeddings.html | 8 +- .../EmbeddingsCoverage$CoverageResult.html | 8 +- .../nlp/embeddings/EmbeddingsCoverage.html | 8 +- .../embeddings/HasEmbeddingsProperties.html | 8 +- .../nlp/embeddings/InstructorEmbeddings$.html | 8 +- .../nlp/embeddings/InstructorEmbeddings.html | 8 +- .../nlp/embeddings/LongformerEmbeddings$.html | 8 +- .../nlp/embeddings/LongformerEmbeddings.html | 8 +- .../nlp/embeddings/MPNetEmbeddings$.html | 8 +- .../nlp/embeddings/MPNetEmbeddings.html | 8 +- .../nlp/embeddings/MxbaiEmbeddings$.html | 8 +- .../nlp/embeddings/MxbaiEmbeddings.html | 8 +- .../nlp/embeddings/NomicEmbeddings$.html | 8 +- .../nlp/embeddings/NomicEmbeddings.html | 8 +- .../PoolingStrategy$$AnnotatorType$.html | 8 +- .../nlp/embeddings/PoolingStrategy$.html | 8 +- .../nlp/embeddings/ReadAlbertDLModel.html | 8 +- .../nlp/embeddings/ReadBGEDLModel.html | 8 +- .../nlp/embeddings/ReadBertDLModel.html | 8 +- .../embeddings/ReadBertSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadCamemBertDLModel.html | 8 +- .../nlp/embeddings/ReadDeBertaDLModel.html | 8 +- 
.../nlp/embeddings/ReadDistilBertDLModel.html | 8 +- .../nlp/embeddings/ReadE5DLModel.html | 8 +- .../nlp/embeddings/ReadElmoDLModel.html | 8 +- .../nlp/embeddings/ReadInstructorDLModel.html | 8 +- .../nlp/embeddings/ReadLongformerDLModel.html | 8 +- .../nlp/embeddings/ReadMPNetDLModel.html | 8 +- .../nlp/embeddings/ReadMxbaiDLModel.html | 8 +- .../ReadNomicEmbeddingsDLModel.html | 8 +- .../nlp/embeddings/ReadRobertaDLModel.html | 8 +- .../ReadRobertaSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadSnowFlakeDLModel.html | 8 +- .../nlp/embeddings/ReadUAEDLModel.html | 8 +- .../nlp/embeddings/ReadUSEDLModel.html | 8 +- .../nlp/embeddings/ReadXlmRobertaDLModel.html | 8 +- .../ReadXlmRobertaSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadXlnetDLModel.html | 8 +- .../ReadablePretrainedAlbertModel.html | 8 +- .../ReadablePretrainedBGEModel.html | 8 +- .../ReadablePretrainedBertModel.html | 8 +- .../ReadablePretrainedBertSentenceModel.html | 8 +- .../ReadablePretrainedCamemBertModel.html | 8 +- .../ReadablePretrainedDeBertaModel.html | 8 +- .../ReadablePretrainedDistilBertModel.html | 8 +- .../embeddings/ReadablePretrainedDoc2Vec.html | 8 +- .../embeddings/ReadablePretrainedE5Model.html | 8 +- .../ReadablePretrainedElmoModel.html | 8 +- .../ReadablePretrainedInstructorModel.html | 8 +- .../ReadablePretrainedLongformerModel.html | 8 +- .../ReadablePretrainedMPNetModel.html | 8 +- .../ReadablePretrainedMxbaiModel.html | 8 +- ...eadablePretrainedNomicEmbeddingsModel.html | 8 +- .../ReadablePretrainedRobertaModel.html | 8 +- ...eadablePretrainedRobertaSentenceModel.html | 8 +- .../ReadablePretrainedSnowFlakeModel.html | 8 +- .../ReadablePretrainedUAEModel.html | 8 +- .../ReadablePretrainedUSEModel.html | 8 +- .../ReadablePretrainedWord2Vec.html | 8 +- .../ReadablePretrainedWordEmbeddings.html | 8 +- .../ReadablePretrainedXlmRobertaModel.html | 8 +- ...ablePretrainedXlmRobertaSentenceModel.html | 8 +- .../ReadablePretrainedXlnetModel.html | 8 +- .../nlp/embeddings/ReadsFromBytes.html | 8 +- .../nlp/embeddings/RoBertaEmbeddings$.html | 8 +- .../nlp/embeddings/RoBertaEmbeddings.html | 8 +- .../RoBertaSentenceEmbeddings$.html | 8 +- .../embeddings/RoBertaSentenceEmbeddings.html | 8 +- .../nlp/embeddings/SentenceEmbeddings$.html | 8 +- .../nlp/embeddings/SentenceEmbeddings.html | 8 +- .../nlp/embeddings/SnowFlakeEmbeddings$.html | 8 +- .../nlp/embeddings/SnowFlakeEmbeddings.html | 8 +- .../nlp/embeddings/UAEEmbeddings$.html | 8 +- .../nlp/embeddings/UAEEmbeddings.html | 8 +- .../embeddings/UniversalSentenceEncoder$.html | 8 +- .../embeddings/UniversalSentenceEncoder.html | 8 +- .../nlp/embeddings/Word2VecApproach$.html | 8 +- .../nlp/embeddings/Word2VecApproach.html | 8 +- .../nlp/embeddings/Word2VecModel$.html | 8 +- .../nlp/embeddings/Word2VecModel.html | 8 +- .../nlp/embeddings/WordEmbeddings$.html | 8 +- .../nlp/embeddings/WordEmbeddings.html | 8 +- .../WordEmbeddingsBinaryIndexer$.html | 8 +- .../nlp/embeddings/WordEmbeddingsModel$.html | 8 +- .../nlp/embeddings/WordEmbeddingsModel.html | 8 +- .../nlp/embeddings/WordEmbeddingsReader.html | 8 +- .../WordEmbeddingsTextIndexer$.html | 8 +- .../nlp/embeddings/WordEmbeddingsWriter.html | 8 +- .../nlp/embeddings/XlmRoBertaEmbeddings$.html | 8 +- .../nlp/embeddings/XlmRoBertaEmbeddings.html | 8 +- .../XlmRoBertaSentenceEmbeddings$.html | 8 +- .../XlmRoBertaSentenceEmbeddings.html | 8 +- .../nlp/embeddings/XlnetEmbeddings$.html | 8 +- .../nlp/embeddings/XlnetEmbeddings.html | 8 +- .../johnsnowlabs/nlp/embeddings/index.html | 8 +- 
.../DocumentSimilarityRankerFinisher$.html | 8 +- .../DocumentSimilarityRankerFinisher.html | 8 +- .../com/johnsnowlabs/nlp/finisher/index.html | 8 +- .../nlp/functions$$EachAnnotations.html | 8 +- .../nlp/functions$$ExplodeAnnotations.html | 8 +- .../nlp/functions$$FilterAnnotations.html | 8 +- .../nlp/functions$$MapAnnotations.html | 8 +- docs/api/com/johnsnowlabs/nlp/functions$.html | 12 +- docs/api/com/johnsnowlabs/nlp/index.html | 119 +- .../nlp/pretrained/PretrainedPipeline$.html | 8 +- .../nlp/pretrained/PretrainedPipeline.html | 8 +- .../pretrained/PythonResourceDownloader$.html | 8 +- .../nlp/pretrained/RepositoryMetadata.html | 8 +- .../nlp/pretrained/ResourceDownloader$.html | 8 +- .../nlp/pretrained/ResourceDownloader.html | 8 +- .../nlp/pretrained/ResourceMetadata$.html | 8 +- .../nlp/pretrained/ResourceMetadata.html | 8 +- .../nlp/pretrained/ResourceRequest.html | 8 +- .../nlp/pretrained/ResourceType$.html | 8 +- .../nlp/pretrained/S3ResourceDownloader.html | 8 +- .../johnsnowlabs/nlp/pretrained/index.html | 8 +- .../com/johnsnowlabs/nlp/recursive/index.html | 8 +- .../nlp/recursive/package$$Recursive.html | 8 +- .../recursive/package$$RecursiveModel.html | 8 +- .../nlp/serialization/ArrayFeature.html | 8 +- .../nlp/serialization/Feature.html | 8 +- .../nlp/serialization/MapFeature.html | 8 +- .../SerializedExternalResource.html | 8 +- .../nlp/serialization/SetFeature.html | 8 +- .../nlp/serialization/StructFeature.html | 8 +- .../nlp/serialization/TransducerFeature.html | 8 +- .../johnsnowlabs/nlp/serialization/index.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLL.html | 8 +- .../nlp/training/CoNLL2003NerReader.html | 8 +- .../nlp/training/CoNLLDocument.html | 8 +- .../CoNLLHelper$$CoNLLSentenceCols.html | 8 +- .../training/CoNLLHelper$$CoNLLTokenCols.html | 8 +- .../nlp/training/CoNLLHelper$.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLLU.html | 8 +- .../nlp/training/CoNLLUCols$.html | 8 +- .../nlp/training/CoNLLUDocument.html | 8 +- .../com/johnsnowlabs/nlp/training/POS.html | 8 +- .../johnsnowlabs/nlp/training/PubTator.html | 8 +- .../nlp/training/SpacyToAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/training/index.html | 8 +- .../johnsnowlabs/nlp/util/FinisherUtil$.html | 8 +- .../johnsnowlabs/nlp/util/GraphBuilder.html | 8 +- .../nlp/util/LfuCache$CachedItem.html | 8 +- .../nlp/util/LfuCache$DoubleLinked.html | 8 +- .../nlp/util/LfuCache$FrequencyList.html | 8 +- .../com/johnsnowlabs/nlp/util/LfuCache.html | 8 +- .../nlp/util/LruMap$KeyPriority.html | 8 +- .../nlp/util/LruMap$KeyPriorityOrdering$.html | 8 +- .../api/com/johnsnowlabs/nlp/util/LruMap.html | 8 +- .../nlp/util/SparkNlpConfig$.html | 8 +- docs/api/com/johnsnowlabs/nlp/util/index.html | 8 +- .../nlp/util/io/CloudStorageType$.html | 8 +- .../nlp/util/io/ExternalResource$.html | 8 +- .../nlp/util/io/ExternalResource.html | 8 +- .../nlp/util/io/MatchStrategy$.html | 8 +- .../nlp/util/io/OutputHelper$.html | 8 +- .../com/johnsnowlabs/nlp/util/io/ReadAs$.html | 8 +- .../util/io/ResourceHelper$$SourceStream.html | 8 +- .../nlp/util/io/ResourceHelper$.html | 8 +- .../com/johnsnowlabs/nlp/util/io/index.html | 8 +- .../nlp/util/regex/RegexRule.html | 8 +- .../util/regex/RuleFactory$$RuleMatch.html | 8 +- .../nlp/util/regex/RuleFactory$.html | 8 +- .../nlp/util/regex/RuleFactory.html | 8 +- .../nlp/util/regex/TransformStrategy$.html | 8 +- .../johnsnowlabs/nlp/util/regex/index.html | 8 +- .../com/johnsnowlabs/storage/BytesKey.html | 8 +- .../com/johnsnowlabs/storage/Database$.html | 8 +- 
.../com/johnsnowlabs/storage/Database.html | 8 +- .../johnsnowlabs/storage/HasConnection.html | 8 +- .../com/johnsnowlabs/storage/HasStorage.html | 8 +- .../johnsnowlabs/storage/HasStorageModel.html | 8 +- .../storage/HasStorageOptions.html | 8 +- .../storage/HasStorageReader.html | 8 +- .../johnsnowlabs/storage/HasStorageRef$.html | 8 +- .../johnsnowlabs/storage/HasStorageRef.html | 8 +- .../storage/RocksDBConnection$.html | 8 +- .../storage/RocksDBConnection.html | 8 +- .../storage/StorageBatchWriter.html | 8 +- .../johnsnowlabs/storage/StorageFormat.html | 8 +- .../johnsnowlabs/storage/StorageHelper$.html | 8 +- .../johnsnowlabs/storage/StorageLocator$.html | 8 +- .../johnsnowlabs/storage/StorageLocator.html | 8 +- .../storage/StorageReadWriter.html | 8 +- .../johnsnowlabs/storage/StorageReadable.html | 8 +- .../johnsnowlabs/storage/StorageReader.html | 8 +- .../johnsnowlabs/storage/StorageWriter.html | 8 +- docs/api/com/johnsnowlabs/storage/index.html | 8 +- .../api/com/johnsnowlabs/util/Benchmark$.html | 8 +- docs/api/com/johnsnowlabs/util/Build$.html | 8 +- .../johnsnowlabs/util/CoNLLGenerator$.html | 8 +- .../com/johnsnowlabs/util/ConfigHelper$.html | 8 +- .../com/johnsnowlabs/util/ConfigLoader$.html | 8 +- .../com/johnsnowlabs/util/FileHelper$.html | 8 +- .../com/johnsnowlabs/util/JsonBuilder$.html | 8 +- .../com/johnsnowlabs/util/JsonParser$.html | 8 +- .../johnsnowlabs/util/PipelineModels$.html | 8 +- .../johnsnowlabs/util/TrainingHelper$.html | 8 +- docs/api/com/johnsnowlabs/util/Version$.html | 8 +- docs/api/com/johnsnowlabs/util/Version.html | 8 +- .../johnsnowlabs/util/ZipArchiveUtil$.html | 8 +- docs/api/com/johnsnowlabs/util/index.html | 8 +- .../util/spark/LongMapAccumulator.html | 8 +- .../util/spark/MapAccumulator.html | 8 +- .../johnsnowlabs/util/spark/SparkUtil$.html | 8 +- .../com/johnsnowlabs/util/spark/index.html | 8 +- docs/api/index.html | 8 +- docs/api/index.js | 2 +- docs/api/python/.buildinfo | 2 +- docs/api/python/genindex.html | 40 +- docs/api/python/getting_started/index.html | 20 +- docs/api/python/index.html | 2 +- docs/api/python/modules/index.html | 4 +- docs/api/python/modules/sparknlp.html | 6 +- .../python/modules/sparknlp/annotation.html | 2 +- .../modules/sparknlp/annotation_audio.html | 2 +- .../modules/sparknlp/annotation_image.html | 2 +- .../annotator/audio/hubert_for_ctc.html | 2 +- .../annotator/audio/wav2vec2_for_ctc.html | 2 +- .../annotator/audio/whisper_for_ctc.html | 2 +- .../sparknlp/annotator/chunk2_doc.html | 2 +- .../modules/sparknlp/annotator/chunker.html | 2 +- .../albert_for_question_answering.html | 2 +- .../albert_for_sequence_classification.html | 2 +- .../albert_for_token_classification.html | 2 +- .../albert_for_zero_shot_classification.html | 2 +- .../bart_for_zero_shot_classification.html | 2 +- .../bert_for_multiple_choice.html | 577 +++ .../bert_for_question_answering.html | 2 +- .../bert_for_sequence_classification.html | 2 +- .../bert_for_token_classification.html | 2 +- .../bert_for_zero_shot_classification.html | 2 +- .../camembert_for_question_answering.html | 2 +- ...camembert_for_sequence_classification.html | 2 +- .../camembert_for_token_classification.html | 2 +- ...amembert_for_zero_shot_classification.html | 2 +- .../classifier_dl/classifier_dl.html | 2 +- .../deberta_for_question_answering.html | 2 +- .../deberta_for_sequence_classification.html | 2 +- .../deberta_for_token_classification.html | 2 +- .../deberta_for_zero_shot_classification.html | 2 +- .../distil_bert_for_question_answering.html | 2 +- 
...stil_bert_for_sequence_classification.html | 2 +- .../distil_bert_for_token_classification.html | 2 +- ...til_bert_for_zero_shot_classification.html | 2 +- .../longformer_for_question_answering.html | 2 +- ...ongformer_for_sequence_classification.html | 2 +- .../longformer_for_token_classification.html | 2 +- .../mpnet_for_question_answering.html | 2 +- .../mpnet_for_sequence_classification.html | 2 +- .../mpnet_for_token_classification.html | 2 +- .../classifier_dl/multi_classifier_dl.html | 2 +- .../roberta_for_question_answering.html | 2 +- .../roberta_for_sequence_classification.html | 2 +- .../roberta_for_token_classification.html | 2 +- .../roberta_for_zero_shot_classification.html | 2 +- .../annotator/classifier_dl/sentiment_dl.html | 2 +- .../tapas_for_question_answering.html | 2 +- .../xlm_roberta_for_question_answering.html | 2 +- ...m_roberta_for_sequence_classification.html | 2 +- .../xlm_roberta_for_token_classification.html | 2 +- ..._roberta_for_zero_shot_classification.html | 2 +- .../xlnet_for_sequence_classification.html | 2 +- .../xlnet_for_token_classification.html | 2 +- .../annotator/coref/spanbert_coref.html | 2 +- .../cv/clip_for_zero_shot_classification.html | 2 +- .../cv/convnext_for_image_classification.html | 2 +- .../cv/swin_for_image_classification.html | 2 +- ..._encoder_decoder_for_image_captioning.html | 2 +- .../cv/vit_for_image_classification.html | 2 +- .../sparknlp/annotator/date2_chunk.html | 2 +- .../dependency/dependency_parser.html | 2 +- .../dependency/typed_dependency_parser.html | 2 +- .../document_character_text_splitter.html | 2 +- .../annotator/document_normalizer.html | 2 +- .../annotator/document_token_splitter.html | 2 +- .../document_token_splitter_test.html | 2 +- .../embeddings/albert_embeddings.html | 2 +- .../annotator/embeddings/bert_embeddings.html | 2 +- .../embeddings/bert_sentence_embeddings.html | 2 +- .../annotator/embeddings/bge_embeddings.html | 2 +- .../embeddings/camembert_embeddings.html | 2 +- .../embeddings/chunk_embeddings.html | 2 +- .../embeddings/deberta_embeddings.html | 2 +- .../embeddings/distil_bert_embeddings.html | 2 +- .../annotator/embeddings/doc2vec.html | 2 +- .../annotator/embeddings/e5_embeddings.html | 2 +- .../annotator/embeddings/elmo_embeddings.html | 2 +- .../embeddings/instructor_embeddings.html | 2 +- .../embeddings/longformer_embeddings.html | 2 +- .../embeddings/mpnet_embeddings.html | 2 +- .../embeddings/mxbai_embeddings.html | 2 +- .../embeddings/nomic_embeddings.html | 2 +- .../embeddings/roberta_embeddings.html | 2 +- .../roberta_sentence_embeddings.html | 2 +- .../embeddings/sentence_embeddings.html | 2 +- .../embeddings/snowflake_embeddings.html | 2 +- .../annotator/embeddings/uae_embeddings.html | 2 +- .../universal_sentence_encoder.html | 2 +- .../annotator/embeddings/word2vec.html | 2 +- .../annotator/embeddings/word_embeddings.html | 2 +- .../embeddings/xlm_roberta_embeddings.html | 2 +- .../xlm_roberta_sentence_embeddings.html | 2 +- .../embeddings/xlnet_embeddings.html | 2 +- .../sparknlp/annotator/er/entity_ruler.html | 2 +- .../sparknlp/annotator/graph_extraction.html | 2 +- .../yake_keyword_extraction.html | 2 +- .../annotator/ld_dl/language_detector_dl.html | 2 +- .../sparknlp/annotator/lemmatizer.html | 2 +- .../annotator/matcher/big_text_matcher.html | 2 +- .../annotator/matcher/date_matcher.html | 2 +- .../annotator/matcher/multi_date_matcher.html | 2 +- .../annotator/matcher/regex_matcher.html | 2 +- .../annotator/matcher/text_matcher.html | 2 +- 
.../sparknlp/annotator/n_gram_generator.html | 2 +- .../sparknlp/annotator/ner/ner_approach.html | 2 +- .../sparknlp/annotator/ner/ner_converter.html | 2 +- .../sparknlp/annotator/ner/ner_crf.html | 2 +- .../sparknlp/annotator/ner/ner_dl.html | 2 +- .../annotator/ner/ner_overwriter.html | 2 +- .../annotator/ner/zero_shot_ner_model.html | 2 +- .../sparknlp/annotator/normalizer.html | 2 +- .../annotator/openai/openai_completion.html | 2 +- .../annotator/openai/openai_embeddings.html | 2 +- .../annotator/param/classifier_encoder.html | 2 +- .../annotator/param/evaluation_dl_params.html | 2 +- .../sparknlp/annotator/pos/perceptron.html | 2 +- .../annotator/sentence/sentence_detector.html | 2 +- .../sentence/sentence_detector_dl.html | 2 +- .../sentiment/sentiment_detector.html | 2 +- .../annotator/sentiment/vivekn_sentiment.html | 2 +- .../annotator/seq2seq/auto_gguf_model.html | 2 +- .../annotator/seq2seq/bart_transformer.html | 2 +- .../annotator/seq2seq/cpm_transformer.html | 2 +- .../annotator/seq2seq/gpt2_transformer.html | 2 +- .../annotator/seq2seq/llama2_transformer.html | 2 +- .../annotator/seq2seq/llama3_transformer.html | 2 +- .../annotator/seq2seq/m2m100_transformer.html | 2 +- .../annotator/seq2seq/marian_transformer.html | 2 +- .../seq2seq/mistral_transformer.html | 2 +- .../annotator/seq2seq/nllb_transformer.html | 2 +- .../annotator/seq2seq/phi2_transformer.html | 2 +- .../annotator/seq2seq/phi3_transformer.html | 2 +- .../annotator/seq2seq/qwen_transformer.html | 2 +- .../seq2seq/starcoder_transformer.html | 2 +- .../annotator/seq2seq/t5_transformer.html | 2 +- .../document_similarity_ranker.html | 2 +- .../spell_check/context_spell_checker.html | 2 +- .../spell_check/norvig_sweeting.html | 2 +- .../spell_check/symmetric_delete.html | 2 +- .../modules/sparknlp/annotator/stemmer.html | 2 +- .../annotator/stop_words_cleaner.html | 2 +- .../annotator/tf_ner_dl_graph_builder.html | 2 +- .../annotator/token/chunk_tokenizer.html | 2 +- .../annotator/token/recursive_tokenizer.html | 2 +- .../annotator/token/regex_tokenizer.html | 2 +- .../sparknlp/annotator/token/tokenizer.html | 2 +- .../sparknlp/annotator/token2_chunk.html | 2 +- .../sparknlp/annotator/ws/word_segmenter.html | 2 +- .../sparknlp/base/audio_assembler.html | 2 +- .../modules/sparknlp/base/doc2_chunk.html | 2 +- .../sparknlp/base/document_assembler.html | 2 +- .../sparknlp/base/embeddings_finisher.html | 2 +- .../modules/sparknlp/base/finisher.html | 2 +- .../modules/sparknlp/base/graph_finisher.html | 2 +- .../sparknlp/base/has_recursive_fit.html | 2 +- .../base/has_recursive_transform.html | 2 +- .../sparknlp/base/image_assembler.html | 2 +- .../modules/sparknlp/base/light_pipeline.html | 2 +- .../base/multi_document_assembler.html | 2 +- .../sparknlp/base/prompt_assembler.html | 623 ++++ .../sparknlp/base/recursive_pipeline.html | 2 +- .../sparknlp/base/table_assembler.html | 2 +- .../sparknlp/base/token_assembler.html | 2 +- .../sparknlp/common/annotator_approach.html | 2 +- .../sparknlp/common/annotator_model.html | 2 +- .../sparknlp/common/annotator_properties.html | 2 +- .../sparknlp/common/match_strategy.html | 2 +- .../modules/sparknlp/common/properties.html | 2 +- .../modules/sparknlp/common/read_as.html | 2 +- .../common/recursive_annotator_approach.html | 2 +- .../python/modules/sparknlp/common/utils.html | 2 +- .../python/modules/sparknlp/functions.html | 2 +- .../sparknlp/internal/annotator_java_ml.html | 2 +- .../internal/annotator_transformer.html | 2 +- .../internal/extended_java_wrapper.html | 2 +- 
.../internal/params_getters_setters.html | 2 +- .../modules/sparknlp/internal/recursive.html | 2 +- .../modules/sparknlp/logging/comet.html | 2 +- .../pretrained/pretrained_pipeline.html | 2 +- .../pretrained/resource_downloader.html | 2 +- .../modules/sparknlp/training/conll.html | 2 +- .../modules/sparknlp/training/conllu.html | 2 +- .../python/modules/sparknlp/training/pos.html | 2 +- .../modules/sparknlp/training/pub_tator.html | 2 +- .../training/spacy_to_annotation.html | 2 +- docs/api/python/objects.inv | Bin 15864 -> 16030 bytes docs/api/python/py-modindex.html | 12 +- .../sparknlp/annotation/index.html | 2 +- .../sparknlp/annotation_audio/index.html | 2 +- .../sparknlp/annotation_image/index.html | 2 +- .../annotator/audio/hubert_for_ctc/index.html | 2 +- .../sparknlp/annotator/audio/index.html | 2 +- .../audio/wav2vec2_for_ctc/index.html | 2 +- .../audio/whisper_for_ctc/index.html | 2 +- .../sparknlp/annotator/chunk2_doc/index.html | 2 +- .../sparknlp/annotator/chunker/index.html | 2 +- .../albert_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../bert_for_multiple_choice/index.html | 596 +++ .../bert_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../bert_for_token_classification/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../classifier_dl/classifier_dl/index.html | 2 +- .../deberta_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../annotator/classifier_dl/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../mpnet_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../mpnet_for_token_classification/index.html | 2 +- .../multi_classifier_dl/index.html | 2 +- .../roberta_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../classifier_dl/sentiment_dl/index.html | 2 +- .../tapas_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../xlnet_for_token_classification/index.html | 2 +- .../sparknlp/annotator/coref/index.html | 2 +- .../annotator/coref/spanbert_coref/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../sparknlp/annotator/cv/index.html | 2 +- .../swin_for_image_classification/index.html | 2 +- .../index.html | 2 +- .../vit_for_image_classification/index.html | 2 +- .../sparknlp/annotator/date2_chunk/index.html | 2 +- .../dependency/dependency_parser/index.html | 2 +- .../sparknlp/annotator/dependency/index.html | 2 +- .../typed_dependency_parser/index.html | 2 +- .../index.html | 2 +- .../annotator/document_normalizer/index.html | 2 +- .../document_token_splitter/index.html | 2 +- .../document_token_splitter_test/index.html | 2 +- .../embeddings/albert_embeddings/index.html | 2 +- .../embeddings/bert_embeddings/index.html | 2 +- .../bert_sentence_embeddings/index.html | 2 +- .../embeddings/bge_embeddings/index.html | 2 +- .../camembert_embeddings/index.html | 2 +- .../embeddings/chunk_embeddings/index.html | 2 +- .../embeddings/deberta_embeddings/index.html | 2 +- .../distil_bert_embeddings/index.html | 2 +- .../annotator/embeddings/doc2vec/index.html | 2 +- .../embeddings/e5_embeddings/index.html | 2 +- 
.../embeddings/elmo_embeddings/index.html | 2 +- .../sparknlp/annotator/embeddings/index.html | 2 +- .../instructor_embeddings/index.html | 2 +- .../longformer_embeddings/index.html | 2 +- .../embeddings/mpnet_embeddings/index.html | 2 +- .../embeddings/mxbai_embeddings/index.html | 2 +- .../embeddings/nomic_embeddings/index.html | 2 +- .../embeddings/roberta_embeddings/index.html | 2 +- .../roberta_sentence_embeddings/index.html | 2 +- .../embeddings/sentence_embeddings/index.html | 2 +- .../snowflake_embeddings/index.html | 2 +- .../embeddings/uae_embeddings/index.html | 2 +- .../universal_sentence_encoder/index.html | 2 +- .../annotator/embeddings/word2vec/index.html | 2 +- .../embeddings/word_embeddings/index.html | 2 +- .../xlm_roberta_embeddings/index.html | 2 +- .../index.html | 2 +- .../embeddings/xlnet_embeddings/index.html | 2 +- .../annotator/er/entity_ruler/index.html | 2 +- .../sparknlp/annotator/er/index.html | 2 +- .../annotator/graph_extraction/index.html | 2 +- .../autosummary/sparknlp/annotator/index.html | 2 +- .../annotator/keyword_extraction/index.html | 2 +- .../yake_keyword_extraction/index.html | 2 +- .../sparknlp/annotator/ld_dl/index.html | 2 +- .../ld_dl/language_detector_dl/index.html | 2 +- .../sparknlp/annotator/lemmatizer/index.html | 2 +- .../matcher/big_text_matcher/index.html | 2 +- .../annotator/matcher/date_matcher/index.html | 2 +- .../sparknlp/annotator/matcher/index.html | 2 +- .../matcher/multi_date_matcher/index.html | 2 +- .../matcher/regex_matcher/index.html | 2 +- .../annotator/matcher/text_matcher/index.html | 2 +- .../annotator/n_gram_generator/index.html | 2 +- .../sparknlp/annotator/ner/index.html | 2 +- .../annotator/ner/ner_approach/index.html | 2 +- .../annotator/ner/ner_converter/index.html | 2 +- .../sparknlp/annotator/ner/ner_crf/index.html | 2 +- .../sparknlp/annotator/ner/ner_dl/index.html | 2 +- .../annotator/ner/ner_overwriter/index.html | 2 +- .../ner/zero_shot_ner_model/index.html | 2 +- .../sparknlp/annotator/normalizer/index.html | 2 +- .../sparknlp/annotator/openai/index.html | 2 +- .../openai/openai_completion/index.html | 2 +- .../openai/openai_embeddings/index.html | 2 +- .../param/classifier_encoder/index.html | 2 +- .../param/evaluation_dl_params/index.html | 2 +- .../sparknlp/annotator/param/index.html | 2 +- .../sparknlp/annotator/pos/index.html | 2 +- .../annotator/pos/perceptron/index.html | 2 +- .../sparknlp/annotator/sentence/index.html | 2 +- .../sentence/sentence_detector/index.html | 2 +- .../sentence/sentence_detector_dl/index.html | 2 +- .../sparknlp/annotator/sentiment/index.html | 2 +- .../sentiment/sentiment_detector/index.html | 2 +- .../sentiment/vivekn_sentiment/index.html | 2 +- .../seq2seq/auto_gguf_model/index.html | 2 +- .../seq2seq/bart_transformer/index.html | 2 +- .../seq2seq/cpm_transformer/index.html | 2 +- .../seq2seq/gpt2_transformer/index.html | 2 +- .../sparknlp/annotator/seq2seq/index.html | 2 +- .../seq2seq/llama2_transformer/index.html | 2 +- .../seq2seq/llama3_transformer/index.html | 2 +- .../seq2seq/m2m100_transformer/index.html | 2 +- .../seq2seq/marian_transformer/index.html | 2 +- .../seq2seq/mistral_transformer/index.html | 2 +- .../seq2seq/nllb_transformer/index.html | 2 +- .../seq2seq/phi2_transformer/index.html | 2 +- .../seq2seq/phi3_transformer/index.html | 2 +- .../seq2seq/qwen_transformer/index.html | 2 +- .../seq2seq/starcoder_transformer/index.html | 2 +- .../seq2seq/t5_transformer/index.html | 2 +- .../document_similarity_ranker/index.html | 2 +- 
.../sparknlp/annotator/similarity/index.html | 2 +- .../context_spell_checker/index.html | 2 +- .../sparknlp/annotator/spell_check/index.html | 2 +- .../spell_check/norvig_sweeting/index.html | 2 +- .../spell_check/symmetric_delete/index.html | 2 +- .../sparknlp/annotator/stemmer/index.html | 2 +- .../annotator/stop_words_cleaner/index.html | 2 +- .../tf_ner_dl_graph_builder/index.html | 2 +- .../token/chunk_tokenizer/index.html | 2 +- .../sparknlp/annotator/token/index.html | 2 +- .../token/recursive_tokenizer/index.html | 2 +- .../token/regex_tokenizer/index.html | 2 +- .../annotator/token/tokenizer/index.html | 2 +- .../annotator/token2_chunk/index.html | 2 +- .../sparknlp/annotator/ws/index.html | 2 +- .../annotator/ws/word_segmenter/index.html | 2 +- .../sparknlp/base/audio_assembler/index.html | 3 +- .../sparknlp/base/doc2_chunk/index.html | 3 +- .../base/document_assembler/index.html | 3 +- .../base/embeddings_finisher/index.html | 3 +- .../sparknlp/base/finisher/index.html | 3 +- .../sparknlp/base/graph_finisher/index.html | 3 +- .../base/has_recursive_fit/index.html | 3 +- .../base/has_recursive_transform/index.html | 3 +- .../sparknlp/base/image_assembler/index.html | 3 +- .../autosummary/sparknlp/base/index.html | 4 +- .../sparknlp/base/light_pipeline/index.html | 3 +- .../base/multi_document_assembler/index.html | 2 +- .../sparknlp/base/prompt_assembler/index.html | 601 +++ .../base/recursive_pipeline/index.html | 3 +- .../sparknlp/base/table_assembler/index.html | 3 +- .../sparknlp/base/token_assembler/index.html | 3 +- .../common/annotator_approach/index.html | 2 +- .../common/annotator_model/index.html | 2 +- .../common/annotator_properties/index.html | 2 +- .../sparknlp/common/annotator_type/index.html | 2 +- .../common/coverage_result/index.html | 2 +- .../autosummary/sparknlp/common/index.html | 2 +- .../sparknlp/common/match_strategy/index.html | 2 +- .../sparknlp/common/properties/index.html | 2 +- .../sparknlp/common/read_as/index.html | 2 +- .../recursive_annotator_approach/index.html | 2 +- .../sparknlp/common/storage/index.html | 2 +- .../sparknlp/common/utils/index.html | 2 +- .../autosummary/sparknlp/functions/index.html | 2 +- .../reference/autosummary/sparknlp/index.html | 3 +- .../internal/annotator_java_ml/index.html | 2 +- .../internal/annotator_transformer/index.html | 2 +- .../internal/extended_java_wrapper/index.html | 2 +- .../autosummary/sparknlp/internal/index.html | 2 +- .../params_getters_setters/index.html | 2 +- .../sparknlp/internal/recursive/index.html | 2 +- .../sparknlp/logging/comet/index.html | 2 +- .../autosummary/sparknlp/logging/index.html | 2 +- .../sparknlp/pretrained/index.html | 2 +- .../pretrained/pretrained_pipeline/index.html | 2 +- .../pretrained/resource_downloader/index.html | 2 +- .../sparknlp/pretrained/utils/index.html | 2 +- .../sparknlp/training/conll/index.html | 2 +- .../sparknlp/training/conllu/index.html | 2 +- .../autosummary/sparknlp/training/index.html | 2 +- .../sparknlp/training/pos/index.html | 2 +- .../sparknlp/training/pub_tator/index.html | 2 +- .../training/spacy_to_annotation/index.html | 2 +- .../sparknlp/training/tfgraphs/index.html | 2 +- .../sparknlp/upload_to_hub/index.html | 2 +- .../autosummary/sparknlp/util/index.html | 2 +- docs/api/python/reference/index.html | 2 +- docs/api/python/search.html | 2 +- docs/api/python/searchindex.js | 2 +- .../python/static/documentation_options.js | 2 +- docs/api/python/third_party/Comet.html | 2 +- docs/api/python/third_party/MLflow.html | 2 +- 
 docs/api/python/third_party/index.html | 2 +-
 docs/api/python/user_guide/annotation.html | 2 +-
 docs/api/python/user_guide/annotators.html | 2 +-
 .../python/user_guide/custom_pipelines.html | 2 +-
 docs/api/python/user_guide/helpers.html | 2 +-
 docs/api/python/user_guide/index.html | 2 +-
 .../python/user_guide/light_pipelines.html | 2 +-
 .../user_guide/pretrained_pipelines.html | 2 +-
 docs/api/python/user_guide/training.html | 2 +-
 docs/api/scala/collection/compat/index.html | 8 +-
 docs/api/scala/collection/index.html | 8 +-
 docs/api/scala/index.html | 8 +-
 .../classifier/dl/BertForMultipleChoice.scala | 150 +-
 .../annotators/seq2seq/AutoGGUFModel.scala | 2 +-
 .../dl/BertForMultipleChoiceTestSpec.scala | 11 +-
 1631 files changed, 21418 insertions(+), 5317 deletions(-)
 create mode 100644 docs/api/com/johnsnowlabs/ml/util/LlamaCPP$.html
 create mode 100644 docs/api/com/johnsnowlabs/nlp/PromptAssembler$.html
 create mode 100644 docs/api/com/johnsnowlabs/nlp/PromptAssembler.html
 create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoice$.html
 create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForMultipleChoice.html
 create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/ReadBertForMultipleChoiceModel.html
 create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/ReadablePretrainedBertForMultipleChoiceModel.html
 create mode 100644 docs/api/python/modules/sparknlp/annotator/classifier_dl/bert_for_multiple_choice.html
 create mode 100644 docs/api/python/modules/sparknlp/base/prompt_assembler.html
 create mode 100644 docs/api/python/reference/autosummary/sparknlp/annotator/classifier_dl/bert_for_multiple_choice/index.html
 create mode 100644 docs/api/python/reference/autosummary/sparknlp/base/prompt_assembler/index.html

diff --git a/docs/api/com/index.html b/docs/api/com/index.html
index 770f6f64dbc4fc..771e3d19586f9b 100644
--- a/docs/api/com/index.html
+++ b/docs/api/com/index.html
@@ -3,9 +3,9 @@
-      Spark NLP 5.5.0 ScalaDoc  - com
-      
-      
+      Spark NLP 5.5.1 ScalaDoc  - com
+      
+      
@@ -28,7 +28,7 @@