diff --git a/transforms/language/gneissweb_classification/gneissweb_classification.ipynb b/transforms/language/gneissweb_classification/gneissweb_classification.ipynb index 17a5a2e7b..6eeb25733 100644 --- a/transforms/language/gneissweb_classification/gneissweb_classification.ipynb +++ b/transforms/language/gneissweb_classification/gneissweb_classification.ipynb @@ -39,11 +39,11 @@ "| Configuration Parameters | Default | Description |\n", "|------------|----------|--------------|\n", "| gcls_model_credential | _unset_ | specifies the credential you use to get model. This will be huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |\n", - "| gcls_model_file_name | _unset_ | specifies what filename of model you use to get model, like `model.bin` |\n", - "| gcls_model_url | _unset_ | specifies url that model locates. For fasttext, this will be repo nme of the model, like `facebook/fasttext-language-identification` |\n", + "| gcls_model_file_name | _unset_ | specifies the list of model filenames used to get the models, like [`model.bin`] |\n", + "| gcls_model_url | _unset_ | specifies the list of URLs where the models are located. 
For fasttext, these will be the repo names of the models, like [`facebook/fasttext-language-identification`] |\n", "| gcls_content_column_name | `contents` | specifies name of the column containing documents |\n", - "| gcls_output_label_column_name | `label` | specifies name of the output column to hold predicted classes |\n", - "| gcls_output_score_column_name | `score` | specifies name of the output column to hold score of prediction |" + "| gcls_output_label_column_name | [`label`] | specifies the names of the output columns that hold the predicted classes |\n", + "| gcls_output_score_column_name | [`score`] | specifies the names of the output columns that hold the prediction scores |" ] }, { @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", "metadata": {}, "outputs": [ @@ -82,21 +82,31 @@ "name": "stderr", "output_type": "stream", "text": [ - "09:52:55 INFO - parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_file_name': 'model.bin', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_label_column_name': 'lang', 'output_score_column_name': 'score'}\n", - "09:52:55 INFO - pipeline id pipeline_id\n", - "09:52:55 INFO - code location None\n", - "09:52:55 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", - "09:52:55 INFO - data factory data_ max_files -1, n_sample -1\n", - "09:52:55 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", - "09:52:55 INFO - orchestrator gcls started at 2025-01-27 09:52:55\n", - "09:52:55 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", + "13:44:52 INFO - parameters are : {'gcls_model_credential': 'PUT YOUR OWN 
HUGGINGFACE CREDENTIAL', 'gcls_model_file_name': [\"['model.bin']\"], 'gcls_model_url': [\"['facebook/fasttext-language-identification']\"], 'gcls_content_column_name': 'text', 'gcls_output_label_column_name': [\"['lang']\"], 'gcls_output_score_column_name': [\"['score']\"], 'gcls_n_processes': 1}\n", + "13:44:52 INFO - pipeline id pipeline_id\n", + "13:44:52 INFO - code location None\n", + "13:44:52 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "13:44:52 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:44:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:44:52 INFO - orchestrator gcls started at 2025-02-14 13:44:52\n", + "13:44:52 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n", + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", + "13:44:53 INFO - Completed 1 files (33.33%) in 0.016 min\n", "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", - "09:52:57 INFO - Completed 1 files (33.33%) in 0.01 min\n", - "09:52:57 INFO - Completed 2 files (66.67%) in 0.011 min\n", - "09:52:57 INFO - Completed 3 files (100.0%) in 0.014 min\n", - "09:52:57 INFO - Done processing 3 files, waiting for flush() completion.\n", - "09:52:57 INFO - done flushing in 0.0 sec\n", - "09:52:57 INFO - Completed execution in 0.029 min, execution result 0\n" + "13:44:54 INFO - Completed 2 files (66.67%) in 0.029 min\n", + "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", + "13:44:55 INFO - Completed 3 files (100.0%) in 0.043 
min\n", + "13:44:55 INFO - Done processing 3 files, waiting for flush() completion.\n", + "13:44:55 INFO - done flushing in 0.0 sec\n", + "13:44:55 INFO - Completed execution in 0.043 min, execution result 0\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.09 s, sys: 910 ms, total: 2 s\n", + "Wall time: 2.57 s\n" ] }, { @@ -105,7 +115,7 @@ "0" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -115,8 +125,8 @@ "Classification(input_folder= \"test-data/input\",\n", " output_folder= \"output\",\n", " gcls_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n", - " gcls_model_file_name= \"model.bin\",\n", - " gcls_model_url= \"facebook/fasttext-language-identification\",\n", + " gcls_model_file_name= [\"model.bin\"],\n", + " gcls_model_url= [\"facebook/fasttext-language-identification\"],\n", " gcls_content_column_name= \"text\").transform()" ] }, @@ -130,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, "outputs": [ @@ -143,7 +153,7 @@ " 'output/test_01.parquet']" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -155,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", "metadata": {}, "outputs": [ @@ -299,7 +309,7 @@ "[200 rows x 4 columns]" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" }