From 644856410c11071dcd65e91aae5e737a2e3cb5db Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Fri, 29 Nov 2024 22:23:19 +0500 Subject: [PATCH 1/4] Default name updates --- .../sparknlp/annotator/embeddings/nomic_embeddings.py | 6 +++--- python/sparknlp/annotator/seq2seq/cpm_transformer.py | 10 +++++----- python/sparknlp/annotator/seq2seq/nllb_transformer.py | 8 ++++---- python/sparknlp/annotator/seq2seq/phi3_transformer.py | 8 ++++---- python/sparknlp/annotator/seq2seq/qwen_transformer.py | 6 +++--- .../nlp/annotators/seq2seq/CPMTransformer.scala | 7 ++++--- .../nlp/annotators/seq2seq/NLLBTransformer.scala | 6 +++--- .../nlp/annotators/seq2seq/Phi3Transformer.scala | 6 +++--- .../nlp/annotators/seq2seq/QwenTransformer.scala | 6 +++--- .../johnsnowlabs/nlp/embeddings/NomicEmbeddings.scala | 6 +++--- 10 files changed, 35 insertions(+), 34 deletions(-) diff --git a/python/sparknlp/annotator/embeddings/nomic_embeddings.py b/python/sparknlp/annotator/embeddings/nomic_embeddings.py index 430418ae8fc272..b80597cac937d1 100644 --- a/python/sparknlp/annotator/embeddings/nomic_embeddings.py +++ b/python/sparknlp/annotator/embeddings/nomic_embeddings.py @@ -31,7 +31,7 @@ class NomicEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveP ... .setOutputCol("nomic_embeddings") - The default model is ``"nomic_small"``, if no name is provided. + The default model is ``"nomic_embed_v1"``, if no name is provided. For available pretrained models please see the `Models Hub `__. @@ -159,13 +159,13 @@ def loadSavedModel(folder, spark_session, use_openvino=False): return NomicEmbeddings(java_model=jModel) @staticmethod - def pretrained(name="nomic_small", lang="en", remote_loc=None): + def pretrained(name="nomic_embed_v1", lang="en", remote_loc=None): """Downloads and loads a pretrained model. 
Parameters ---------- name : str, optional - Name of the pretrained model, by default "nomic_small" + Name of the pretrained model, by default "nomic_embed_v1" lang : str, optional Language of the pretrained model, by default "en" remote_loc : str, optional diff --git a/python/sparknlp/annotator/seq2seq/cpm_transformer.py b/python/sparknlp/annotator/seq2seq/cpm_transformer.py index ad669815b2c68c..7da0e216686fd6 100644 --- a/python/sparknlp/annotator/seq2seq/cpm_transformer.py +++ b/python/sparknlp/annotator/seq2seq/cpm_transformer.py @@ -44,7 +44,7 @@ class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): ... .setOutputCol("generation") - The default model is ``"llam2-7b"``, if no name is provided. For available + The default model is ``"mini_cpm_2b_8bit"``, if no name is provided. For available pretrained models please see the `Models Hub `__. @@ -104,7 +104,7 @@ class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): >>> documentAssembler = DocumentAssembler() \\ ... .setInputCol("text") \\ ... .setOutputCol("documents") - >>> cpm = CPMTransformer.pretrained("llama_2_7b_chat_hf_int4") \\ + >>> cpm = CPMTransformer.pretrained("mini_cpm_2b_8bit","xx") \\ ... .setInputCols(["documents"]) \\ ... .setMaxOutputLength(50) \\ ... .setOutputCol("generation") @@ -299,15 +299,15 @@ def loadSavedModel(folder, spark_session, use_openvino = False): return CPMTransformer(java_model=jModel) @staticmethod - def pretrained(name="llama_2_7b_chat_hf_int4", lang="en", remote_loc=None): + def pretrained(name="mini_cpm_2b_8bit", lang="xx", remote_loc=None): """Downloads and loads a pretrained model. 
Parameters ---------- name : str, optional - Name of the pretrained model, by default "llama_2_7b_chat_hf_int4" + Name of the pretrained model, by default "mini_cpm_2b_8bit" lang : str, optional - Language of the pretrained model, by default "en" + Language of the pretrained model, by default "xx" remote_loc : str, optional Optional remote address of the resource, by default None. Will use Spark NLPs repositories otherwise. diff --git a/python/sparknlp/annotator/seq2seq/nllb_transformer.py b/python/sparknlp/annotator/seq2seq/nllb_transformer.py index 443fc324c0fa12..290222150912e5 100644 --- a/python/sparknlp/annotator/seq2seq/nllb_transformer.py +++ b/python/sparknlp/annotator/seq2seq/nllb_transformer.py @@ -32,7 +32,7 @@ class NLLBTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): ... .setOutputCol("generation") - The default model is ``"nllb_418M"``, if no name is provided. For available + The default model is ``"nllb_distilled_600M_8int"``, if no name is provided. For available pretrained models please see the `Models Hub `__. @@ -164,7 +164,7 @@ class NLLBTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): >>> documentAssembler = DocumentAssembler() \\ ... .setInputCol("text") \\ ... .setOutputCol("documents") - >>> nllb = NLLBTransformer.pretrained("nllb_418M") \\ + >>> nllb = NLLBTransformer.pretrained("nllb_distilled_600M_8int") \\ ... .setInputCols(["documents"]) \\ ... .setMaxOutputLength(50) \\ ... .setOutputCol("generation") \\ @@ -398,13 +398,13 @@ def loadSavedModel(folder, spark_session, use_openvino=False): return NLLBTransformer(java_model=jModel) @staticmethod - def pretrained(name="nllb_418M", lang="xx", remote_loc=None): + def pretrained(name="nllb_distilled_600M_8int", lang="xx", remote_loc=None): """Downloads and loads a pretrained model. 
Parameters ---------- name : str, optional - Name of the pretrained model, by default "nllb_418M" + Name of the pretrained model, by default "nllb_distilled_600M_8int" lang : str, optional Language of the pretrained model, by default "en" remote_loc : str, optional diff --git a/python/sparknlp/annotator/seq2seq/phi3_transformer.py b/python/sparknlp/annotator/seq2seq/phi3_transformer.py index 4dcd8135942491..56fde4220ff90b 100644 --- a/python/sparknlp/annotator/seq2seq/phi3_transformer.py +++ b/python/sparknlp/annotator/seq2seq/phi3_transformer.py @@ -37,7 +37,7 @@ class Phi3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): ... .setOutputCol("generation") - The default model is ``"phi3"``, if no name is provided. For available + The default model is ``phi_3_mini_128k_instruct``, if no name is provided. For available pretrained models please see the `Models Hub `__. @@ -112,7 +112,7 @@ class Phi3Transformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): >>> documentAssembler = DocumentAssembler() \\ ... .setInputCol("text") \\ ... .setOutputCol("documents") - >>> phi3 = Phi3Transformer.pretrained("phi3") \\ + >>> phi3 = Phi3Transformer.pretrained(phi_3_mini_128k_instruct) \\ ... .setInputCols(["documents"]) \\ ... .setMaxOutputLength(50) \\ ... .setOutputCol("generation") @@ -308,13 +308,13 @@ def loadSavedModel(folder, spark_session, use_openvino=False): return Phi3Transformer(java_model=jModel) @staticmethod - def pretrained(name="phi3", lang="en", remote_loc=None): + def pretrained(name=phi_3_mini_128k_instruct, lang="en", remote_loc=None): """Downloads and loads a pretrained model. 
Parameters ---------- name : str, optional - Name of the pretrained model, by default "phi3" + Name of the pretrained model, by default "phi_3_mini_128k_instruct" lang : str, optional Language of the pretrained model, by default "en" remote_loc : str, optional Optional remote address of the resource, by default None. Will use Spark NLPs repositories otherwise. diff --git a/python/sparknlp/annotator/seq2seq/qwen_transformer.py b/python/sparknlp/annotator/seq2seq/qwen_transformer.py index 27ece0e914dde1..64b5c19c573a8b 100644 --- a/python/sparknlp/annotator/seq2seq/qwen_transformer.py +++ b/python/sparknlp/annotator/seq2seq/qwen_transformer.py @@ -121,7 +121,7 @@ class QwenTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): >>> documentAssembler = DocumentAssembler() \\ ... .setInputCol("text") \\ ... .setOutputCol("documents") - >>> qwen = QwenTransformer.pretrained("qwen-7b") \\ + >>> qwen = QwenTransformer.pretrained("qwen_7.5b_chat") \\ ... .setInputCols(["documents"]) \\ ... .setMaxOutputLength(50) \\ ... .setOutputCol("generation") @@ -317,13 +317,13 @@ def loadSavedModel(folder, spark_session, use_openvino=False): return QwenTransformer(java_model=jModel) @staticmethod - def pretrained(name="qwen-7b", lang="en", remote_loc=None): + def pretrained(name="qwen_7.5b_chat", lang="en", remote_loc=None): """Downloads and loads a pretrained model. 
Parameters ---------- name : str, optional - Name of the pretrained model, by default "qwen-7b" + Name of the pretrained model, by default "qwen_7.5b_chat" lang : str, optional Language of the pretrained model, by default "en" remote_loc : str, optional diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala index 4ba30b7c0129b4..f458ac93b2906d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala @@ -68,7 +68,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCols("document") * .setOutputCol("generation") * }}} - * The default model is `"llama_2_7b_chat_hf_int4"`, if no name is provided. For available + * The default model is `"mini_cpm_2b_8bit"`, if no name is provided. For available * pretrained models please see the [[https://sparknlp.org/models?q=cpm Models Hub]]. 
* * For extended examples of usage, see @@ -94,7 +94,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCol("text") * .setOutputCol("documents") * - * val cpm = CPMTransformer.pretrained("llama_2_7b_chat_hf_int4") + * val cpm = CPMTransformer.pretrained("mini_cpm_2b_8bit") * .setInputCols(Array("documents")) * .setMinOutputLength(10) * .setMaxOutputLength(50) @@ -311,7 +311,8 @@ class CPMTransformer(override val uid: String) trait ReadablePretrainedCPMTransformerModel extends ParamsAndFeaturesReadable[CPMTransformer] with HasPretrained[CPMTransformer] { - override val defaultModelName: Some[String] = Some("llama_2_7b_chat_hf_int4") + override val defaultModelName: Some[String] = Some("mini_cpm_2b_8bit") + override val defaultLang: String = "xx" /** Java compliant-overrides */ override def pretrained(): CPMTransformer = super.pretrained() diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/NLLBTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/NLLBTransformer.scala index 8f35a6937d587c..cc6fb853c66028 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/NLLBTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/NLLBTransformer.scala @@ -59,7 +59,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCols("document") * .setOutputCol("generation") * }}} - * The default model is `"nllb_418M"`, if no name is provided. For available pretrained models + * The default model is `"nllb_distilled_600M_8int"`, if no name is provided. For available pretrained models * please see the [[https://sparknlp.org/models?q=nllb Models Hub]]. 
* * For extended examples of usage, see @@ -156,7 +156,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCol("text") * .setOutputCol("documents") * - * val nllb = NLLBTransformer.pretrained("nllb_418M") + * val nllb = NLLBTransformer.pretrained("nllb_distilled_600M_8int") * .setInputCols(Array("documents")) * .setSrcLang("zho_Hans") * .serTgtLang("eng_Latn") @@ -635,7 +635,7 @@ class NLLBTransformer(override val uid: String) trait ReadablePretrainedNLLBTransformerModel extends ParamsAndFeaturesReadable[NLLBTransformer] with HasPretrained[NLLBTransformer] { - override val defaultModelName: Some[String] = Some("nllb_418M") + override val defaultModelName: Some[String] = Some("nllb_distilled_600M_8int") override val defaultLang: String = "xx" /** Java compliant-overrides */ diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/Phi3Transformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/Phi3Transformer.scala index e983a4f075553d..d65eac7757e5a5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/Phi3Transformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/Phi3Transformer.scala @@ -65,7 +65,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCols("document") * .setOutputCol("generation") * }}} - * The default model is `"phi_3_mini_128k_instruct_int8"`, if no name is provided. For available + * The default model is `"phi_3_mini_128k_instruct"`, if no name is provided. For available * pretrained models please see the [[https://sparknlp.org/models?q=phi3 Models Hub]]. 
* * For extended examples of usage, see @@ -106,7 +106,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCol("text") * .setOutputCol("documents") * - * val phi3 = Phi3Transformer.pretrained("phi_3_mini_128k_instruct_int8") + * val phi3 = Phi3Transformer.pretrained("phi_3_mini_128k_instruct") * .setInputCols(Array("documents")) * .setMinOutputLength(10) * .setMaxOutputLength(50) @@ -323,7 +323,7 @@ class Phi3Transformer(override val uid: String) trait ReadablePretrainedPhi3TransformerModel extends ParamsAndFeaturesReadable[Phi3Transformer] with HasPretrained[Phi3Transformer] { - override val defaultModelName: Some[String] = Some("phi_3_mini_128k_instruct_int8") + override val defaultModelName: Some[String] = Some("phi_3_mini_128k_instruct") /** Java compliant-overrides */ override def pretrained(): Phi3Transformer = super.pretrained() diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala index 9fd834a577cb47..9811607afbf8f9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala @@ -68,7 +68,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCols("document") * .setOutputCol("generation") * }}} - * The default model is `"Qwen-13b"`, if no name is provided. For available pretrained models + * The default model is `"qwen_7.5b_chat"`, if no name is provided. For available pretrained models * please see the [[https://sparknlp.org/models?q=Qwen Models Hub]]. 
* * For extended examples of usage, see @@ -113,7 +113,7 @@ import org.json4s.jackson.JsonMethods._ * .setInputCol("text") * .setOutputCol("documents") * - * val Qwen = QwenTransformer.pretrained("Qwen-7b") + * val Qwen = QwenTransformer.pretrained("qwen_7.5b_chat") * .setInputCols(Array("documents")) * .setMinOutputLength(10) * .setMaxOutputLength(50) @@ -334,7 +334,7 @@ class QwenTransformer(override val uid: String) trait ReadablePretrainedQwenTransformerModel extends ParamsAndFeaturesReadable[QwenTransformer] with HasPretrained[QwenTransformer] { - override val defaultModelName: Some[String] = Some("Qwen-7b") + override val defaultModelName: Some[String] = Some("qwen_7.5b_chat") /** Java compliant-overrides */ override def pretrained(): QwenTransformer = super.pretrained() diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddings.scala index a53f3eaad60b6a..f6717d78fd2b5a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddings.scala @@ -49,7 +49,7 @@ import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOp * .setInputCols("document") * .setOutputCol("nomic_embeddings") * }}} - * The default model is `"nomic_small"`, if no name is provided. + * The default model is `"nomic_embed_v1"`, if no name is provided. * * For available pretrained models please see the * [[https://sparknlp.org/models?q=NomicEmbeddings Models Hub]]. 
@@ -86,7 +86,7 @@ import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOp * .setInputCol("text") * .setOutputCol("document") * - * val embeddings = NomicEmbeddings.pretrained("nomic_small", "en") + * val embeddings = NomicEmbeddings.pretrained("nomic_embed_v1", "en") * .setInputCols("document") * .setOutputCol("nomic_embeddings") * @@ -357,7 +357,7 @@ class NomicEmbeddings(override val uid: String) trait ReadablePretrainedNomicEmbeddingsModel extends ParamsAndFeaturesReadable[NomicEmbeddings] with HasPretrained[NomicEmbeddings] { - override val defaultModelName: Some[String] = Some("nomic_small") + override val defaultModelName: Some[String] = Some("nomic_embed_v1") /** Java compliant-overrides */ override def pretrained(): NomicEmbeddings = super.pretrained() From 3477f30f9aa25ed02a80701322a701f3bd5346ba Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Fri, 29 Nov 2024 19:58:35 +0100 Subject: [PATCH 2/4] name param should be str --- python/sparknlp/annotator/seq2seq/phi3_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sparknlp/annotator/seq2seq/phi3_transformer.py b/python/sparknlp/annotator/seq2seq/phi3_transformer.py index 56fde4220ff90b..98a28eeac47b96 100644 --- a/python/sparknlp/annotator/seq2seq/phi3_transformer.py +++ b/python/sparknlp/annotator/seq2seq/phi3_transformer.py @@ -308,7 +308,7 @@ def loadSavedModel(folder, spark_session, use_openvino=False): return Phi3Transformer(java_model=jModel) @staticmethod - def pretrained(name=phi_3_mini_128k_instruct, lang="en", remote_loc=None): + def pretrained(name="phi_3_mini_128k_instruct", lang="en", remote_loc=None): """Downloads and loads a pretrained model. 
Parameters From fdbf678e4904feb92de3ca0e40c91662d3c9c3bd Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Fri, 29 Nov 2024 23:16:26 +0100 Subject: [PATCH 3/4] adding CPMTransformer to ResourceDwonloader --- .../com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 145bcc67f26b35..1b8d8ccb864fd1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -690,7 +690,8 @@ object PythonResourceDownloader { "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings, "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification, "BertForMultipleChoice" -> BertForMultipleChoice, - "PromptAssembler" -> PromptAssembler) + "PromptAssembler" -> PromptAssembler, + "CPMTransformer"-> CPMTransformer) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") From 15e3ac6ccc19d3601fea5a6fe879da00866a97a9 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Mon, 2 Dec 2024 17:45:14 +0500 Subject: [PATCH 4/4] Update ResourceDownloader.scala --- .../johnsnowlabs/nlp/pretrained/ResourceDownloader.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 1b8d8ccb864fd1..a6c1e4aff778a9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -691,7 +691,11 @@ object PythonResourceDownloader { "CamemBertForZeroShotClassification" -> 
CamemBertForZeroShotClassification, "BertForMultipleChoice" -> BertForMultipleChoice, "PromptAssembler" -> PromptAssembler, - "CPMTransformer"-> CPMTransformer) + "CPMTransformer"-> CPMTransformer, + "NomicEmbeddings" -> NomicEmbeddings, + "NLLBTransformer" -> NLLBTransformer, + "Phi3Transformer" -> Phi3Transformer, + "QwenTransformer" -> QwenTransformer) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering")