update makefile for using multiple classifiers

IBM · Feb 14, 2025 · 13f5c4f · 13f5c4f
1 parent d46b112
commit 13f5c4f
Showing 1 changed file with 36 additions and 26 deletions.
diff --git a/transforms/language/gneissweb_classification/gneissweb_classification.ipynb b/transforms/language/gneissweb_classification/gneissweb_classification.ipynb
@@ -39,11 +39,11 @@
     "| Configuration Parameters  | Default  | Description |\n",
     "|------------|----------|--------------|\n",
     "| gcls_model_credential | _unset_ | specifies the credential you use to get model. This will be huggingface token. [Guide to get huggingface token](https://huggingface.co/docs/hub/security-tokens) |\n",
-    "| gcls_model_file_name | _unset_ | specifies what filename of model you use to get model, like `model.bin` |\n",
-    "| gcls_model_url | _unset_ |  specifies url that model locates. For fasttext, this will be repo nme of the model, like `facebook/fasttext-language-identification` |\n",
+    "| gcls_model_file_name | _unset_ | specifies the list of model file names to download, like [`model.bin`] |\n",
+    "| gcls_model_url | _unset_ | specifies the list of URLs where the models are located. For fasttext, these are the repo names of the models, like [`facebook/fasttext-language-identification`] |\n",
     "| gcls_content_column_name | `contents` | specifies name of the column containing documents |\n",
-    "| gcls_output_label_column_name | `label` | specifies name of the output column to hold predicted classes |\n",
-    "| gcls_output_score_column_name | `score` | specifies name of the output column to hold score of prediction |"
+    "| gcls_output_label_column_name | [`label`] | specifies the names of the output columns that hold the predicted classes |\n",
+    "| gcls_output_score_column_name | [`score`] | specifies the names of the output columns that hold the prediction scores |"
    ]
   },
   {
@@ -74,29 +74,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "09:52:55 INFO - parameters are : {'model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'model_file_name': 'model.bin', 'model_url': 'facebook/fasttext-language-identification', 'content_column_name': 'text', 'output_label_column_name': 'lang', 'output_score_column_name': 'score'}\n",
-      "09:52:55 INFO - pipeline id pipeline_id\n",
-      "09:52:55 INFO - code location None\n",
-      "09:52:55 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
-      "09:52:55 INFO - data factory data_ max_files -1, n_sample -1\n",
-      "09:52:55 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
-      "09:52:55 INFO - orchestrator gcls started at 2025-01-27 09:52:55\n",
-      "09:52:55 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
+      "13:44:52 INFO - parameters are : {'gcls_model_credential': 'PUT YOUR OWN HUGGINGFACE CREDENTIAL', 'gcls_model_file_name': [\"['model.bin']\"], 'gcls_model_url': [\"['facebook/fasttext-language-identification']\"], 'gcls_content_column_name': 'text', 'gcls_output_label_column_name': [\"['lang']\"], 'gcls_output_score_column_name': [\"['score']\"], 'gcls_n_processes': 1}\n",
+      "13:44:52 INFO - pipeline id pipeline_id\n",
+      "13:44:52 INFO - code location None\n",
+      "13:44:52 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n",
+      "13:44:52 INFO - data factory data_ max_files -1, n_sample -1\n",
+      "13:44:52 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+      "13:44:52 INFO - orchestrator gcls started at 2025-02-14 13:44:52\n",
+      "13:44:52 INFO - Number of files is 3, source profile {'max_file_size': 0.3023223876953125, 'min_file_size': 0.037346839904785156, 'total_file_size': 0.4433746337890625}\n",
+      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
+      "13:44:53 INFO - Completed 1 files (33.33%) in 0.016 min\n",
       "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
-      "09:52:57 INFO - Completed 1 files (33.33%) in 0.01 min\n",
-      "09:52:57 INFO - Completed 2 files (66.67%) in 0.011 min\n",
-      "09:52:57 INFO - Completed 3 files (100.0%) in 0.014 min\n",
-      "09:52:57 INFO - Done processing 3 files, waiting for flush() completion.\n",
-      "09:52:57 INFO - done flushing in 0.0 sec\n",
-      "09:52:57 INFO - Completed execution in 0.029 min, execution result 0\n"
+      "13:44:54 INFO - Completed 2 files (66.67%) in 0.029 min\n",
+      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
+      "13:44:55 INFO - Completed 3 files (100.0%) in 0.043 min\n",
+      "13:44:55 INFO - Done processing 3 files, waiting for flush() completion.\n",
+      "13:44:55 INFO - done flushing in 0.0 sec\n",
+      "13:44:55 INFO - Completed execution in 0.043 min, execution result 0\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1.09 s, sys: 910 ms, total: 2 s\n",
+      "Wall time: 2.57 s\n"
      ]
     },
     {
@@ -105,7 +115,7 @@
        "0"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -115,8 +125,8 @@
     "Classification(input_folder= \"test-data/input\",\n",
     "        output_folder= \"output\",\n",
     "        gcls_model_credential= \"PUT YOUR OWN HUGGINGFACE CREDENTIAL\",\n",
-    "        gcls_model_file_name= \"model.bin\",\n",
-    "        gcls_model_url= \"facebook/fasttext-language-identification\",\n",
+    "        gcls_model_file_name= [\"model.bin\"],\n",
+    "        gcls_model_url= [\"facebook/fasttext-language-identification\"],\n",
     "        gcls_content_column_name= \"text\").transform()"
    ]
   },
@@ -130,7 +140,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "id": "7276fe84-6512-4605-ab65-747351e13a7c",
    "metadata": {},
    "outputs": [
@@ -143,7 +153,7 @@
        " 'output/test_01.parquet']"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -155,7 +165,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
    "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
    "metadata": {},
    "outputs": [
@@ -299,7 +309,7 @@
        "[200 rows x 4 columns]"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }