Remove HF token from code, use .env instead as described in README.md

griko · griko · commit 5120cddc24c1 · 2025-02-17T23:55:00.000+02:00
diff --git a/README.md b/README.md
@@ -32,7 +32,8 @@ execute the first pipline only. If your task is to generate text from a given au
 an audio file as input - use (1) and (3). If you are training a new model and features is of the highest importance - 
 go for (1) and (2) and consider expanding the library when you are satisfied with the result (see example 3).  
 
-Configuration of all the components is made through the `pipline.yaml` configuration file.
+Configuration of all the components is made through the `pipeline.yaml` configuration file. If you use components that require a Hugging Face access token
+(currently pyannote-embedding, pyannote-vad and pyannote-sd), create a `.env` file in the directory you run the pipeline from (it is looked up relative to the current working directory) and add the line `huggingface_ACCESS_TOKEN=<your_token>`; every line of `.env` must have the form `KEY=value`, with no blank or comment lines.
 
 ## Pre-processing components
 ### Filelist-DataFrame Creator
diff --git a/src/notebook_examples/configs/example_pipeline.yaml b/src/notebook_examples/configs/example_pipeline.yaml
@@ -3,7 +3,6 @@ segment_name_separator: "_"
 intermediate_payload_path: 'results/example'
 # device: 'cpu'  # 'cpu'/'cuda'
 log_each_x_records: 100
-huggingface_ACCESS_TOKEN: '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'
 sampling_rate: 16000
 
 preprocessing:
diff --git a/src/notebook_examples/configs/pulp_fiction_pipeline.yaml b/src/notebook_examples/configs/pulp_fiction_pipeline.yaml
@@ -2,7 +2,6 @@ input_dir: '../cut_preprocessed'  # 'speech_examples', 'speech_examples_small'
 segment_name_separator: "_"
 intermediate_payload_path: 'results/pulp_fiction'
 # device: 'cpu'  # 'cpu'/'cuda'
-huggingface_ACCESS_TOKEN: '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'
 max_workers: 1  # set the number of workers for parallel threads
 sampling_rate: 16000
 latent_logger:
@@ -247,7 +246,7 @@ segment_classifier:
   gmm_clustering_diarization:
     classification_column_name: 'gmm_clustering_diarization'
     n_components: 6
-    covariance_type: 'full'  # �full�, �tied�, �diag�, �spherical�
+    covariance_type: 'full'  # 'full', 'tied', 'diag', 'spherical'
     features_list: # assumes multiple column features are labeled 'i_<component_name>' where i is between
       # start_index (inclusive) and stop_index (exclusive)
       - speechbrain_embedding:
diff --git a/src/pipeline.yaml b/src/pipeline.yaml
@@ -2,7 +2,6 @@ input_dir: 'speech_examples_small'  # 'speech_examples', 'speech_examples_small'
 segment_name_separator: "_"
 intermediate_payload_path: 'results'
 # device: 'cpu'  # 'cpu'/'cuda'
-huggingface_ACCESS_TOKEN: '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'
 max_workers: 4  # set the number of workers for parallel threads
 sampling_rate: 16000
 latent_logger:
diff --git a/src/vanpy/utils/utils.py b/src/vanpy/utils/utils.py
@@ -96,12 +96,20 @@ def yaml_placeholder_replacement(full, val=None, initial=True) -> yaml.YAMLObjec
 def load_config(config_yaml_path: str = 'pipeline.yaml') -> Dict:
     """
     Load a YAML configuration file and replace any placeholders with their corresponding values.
+    If there is a .env file, load it and add content to config
+    
     :param config_yaml_path: path of the configuration file
     :return: configuration as a dictionary
     """
     with open(config_yaml_path, 'r') as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
         config = yaml_placeholder_replacement(config)
+    # if there is a .env file, load it and add content to config
+    if os.path.exists('.env'):
+        with open('.env', 'r') as f:
+            for line in f:
+                key, value = line.strip().split('=', 1)
+                config[key] = value
     return config
 
 
diff --git a/tests/config.py b/tests/config.py
@@ -1,6 +1,5 @@
 config_test_PyannoteVAD = {
             'model_params': {},
-            'huggingface_ACCESS_TOKEN': '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',
             'performance_measurement': False,
             'add_segment_metadata': False,
             'output_dir': 'segmented_audio',