Commit 5c661d6 (parent 08bcab2): updated README.md

File tree: 3 files changed (+201, -17 lines)

README.md (15 additions, 15 deletions)
@@ -48,13 +48,13 @@ To demonstrate the use of the package, we consider a dataset with two levels of

We further consider a pretrained 'main' model, for example one which employed the small version of [Meta's _DINO_V2_ architecture](https://dinov2.metademolab.com/) and was fine-tuned on ImageNet50, a subset of the [ImageNet1K dataset](https://www.image-net.org/index.php) with 50 classes (which can be found [here](https://huggingface.co/datasets/lab-v2/ImageNet50)), and whose ability to classify both levels of the hierarchy we want to analyze. An instance of such a model (which can be found [here](https://huggingface.co/lab-v2/dinov2_vits14_imagenet_lr1e-06_BCE)) has the following performance:

- Fine-grain prior combined accuracy: <span style="color:green">76.57</span>% , fine-grain prior combined macro f1: <span style="color:green">76.1</span>%\
- Fine-grain prior combined macro precision: <span style="color:green">76.96</span>% , fine-grain prior combined macro recall: <span style="color:green">76.57</span>%
+ Fine-grain prior combined accuracy: <code style="color:green">76.57%</code> , fine-grain prior combined macro f1: <code style="color:green">76.1%</code>\
+ Fine-grain prior combined macro precision: <code style="color:green">76.96%</code> , fine-grain prior combined macro recall: <code style="color:green">76.57%</code>

- Coarse-grain prior combined accuracy: <span style="color:green">87.14</span>%, coarse-grain prior combined macro f1: <span style="color:green">85.77</span>%\
- Coarse-grain prior combined macro precision: <span style="color:green">87.36</span>%, coarse-grain prior combined macro recall: <span style="color:green">84.64</span>%
+ Coarse-grain prior combined accuracy: <code style="color:green">87.14%</code>, coarse-grain prior combined macro f1: <code style="color:green">85.77%</code>\
+ Coarse-grain prior combined macro precision: <code style="color:green">87.36%</code>, coarse-grain prior combined macro recall: <code style="color:green">84.64%</code>

- Total prior inconsistencies <span style="color:red">133/2100</span> (<span style="color:red">6.33</span>%)
+ Total prior inconsistencies <code style="color:red">133/2100</code> (<code style="color:red">6.33%</code>)

We also consider a 'secondary' model (which can be found [here](https://huggingface.co/lab-v2/dinov2_vitl14_imagenet_lr1e-06_BCE)), which employed the large version of the DINO_V2 architecture and was also fine-tuned on the ImageNet50 dataset, along with binary models trained on each class of the dataset.
Consider the following code snippet to run the `run_experiment` function from PyEDCR.py:
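(The snippet itself sits in README lines 61-80, which this commit does not touch, so it is not shown in the hunks here. Purely for orientation, below is a hypothetical sketch of what such a call could look like; the import path and every `imagenet_config` field are illustrative assumptions, not PyEDCR's confirmed API. Only the call `run_experiment(config=imagenet_config)` is attested, by the hunk header that follows.)

```python
# Hypothetical sketch only: the field names below are illustrative assumptions,
# not PyEDCR's confirmed API; see the actual README snippet for the real config.
from PyEDCR import run_experiment  # assumed import path

imagenet_config = dict(
    data='imagenet',                     # dataset key (assumed)
    main_model='dinov2_vits14',          # small DINOv2 main model (assumed)
    secondary_model='dinov2_vitl14',     # large DINOv2 secondary model (assumed)
    use_binary_models=True,              # per-class binary models (assumed)
)

run_experiment(config=imagenet_config)   # this call is attested in the diff below
```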
@@ -81,15 +81,15 @@ run_experiment(config=imagenet_config)

The code will initiate the rule-learning pipeline, use the learned rules to mark errors in the main model's predictions, and print the algorithm's performance metrics on the error class after running the f-EDR algorithm, which in this case will be:

-```
-error_accuracy: 89.0%
-error_balanced_accuracy: 84.23%
-error_precision: 81.65%
-error_recall: 74.31%
-error_f1: 77.81%
-recovered_constraints_precision: 100.0%
-recovered_constraints_recall: 59.36%
-recovered_constraints_f1_score: 74.5%
+```python
+error_accuracy: 89.0
+error_balanced_accuracy: 84.23
+error_precision: 81.65
+error_recall: 74.31
+error_f1: 77.81
+recovered_constraints_precision: 100.0
+recovered_constraints_recall: 59.36
+recovered_constraints_f1_score: 74.5
```

For further details about the rule-learning algorithm and the noise tolerance experiments, please refer to the [paper](https://arxiv.org/abs/2407.15192).
@@ -99,7 +99,7 @@ For further details about the rule learning algorithm, and noise tolerance exper

This research was funded by ARO grant W911NF-24-1-0007.

<p align="center">
-  <a href="https://arl.devcom.army.mil/who-we-are/aro/">
+  <a href="https://scai.engineering.asu.edu/">
    <img src="https://cdn.shopify.com/s/files/1/1095/6418/files/ASU-sun-devils-new-logo.jpg?v=1481918145" height="150" alt=""/>
  </a>
&emsp;

src/PyEDCR/utils/google_sheets_api.py (186 additions, 0 deletions)

@@ -0,0 +1,186 @@
```python
import os
import typing
import time
import numpy as np

import google_auth_oauthlib.flow
import google.auth.transport.requests
import google.oauth2.credentials
import googleapiclient.discovery
import googleapiclient.errors

from src.PyEDCR.utils import paths

with open(fr'{paths.CREDENTIALS_FOLDER}/spreadsheet_id.txt', 'r') as file:
    # Read the first line and strip any extra whitespace or newline characters
    spreadsheet_id = file.readline().strip()


def initiate_api() -> googleapiclient.discovery.Resource:
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first time.
    scopes = ["https://www.googleapis.com/auth/spreadsheets"]
    if os.path.exists("../../../token.json"):
        creds = google.oauth2.credentials.Credentials.from_authorized_user_file(
            filename="../../../token.json", scopes=scopes)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(google.auth.transport.requests.Request())
        else:
            flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
                client_secrets_file="../../../credentials/credentials.json",
                scopes=scopes)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open("../../../token.json", "w") as token:
            token.write(creds.to_json())

    service = googleapiclient.discovery.build(serviceName="sheets",
                                              version="v4",
                                              credentials=creds)
    sheet = service.spreadsheets()

    return sheet


__sheet: googleapiclient.discovery.Resource = initiate_api()


def get_sheet_tab_name(main_model_name: str,
                       data_str: str,
                       secondary_model_name: typing.Optional[str] = None,
                       binary: bool = False) -> str:
    """Build the spreadsheet tab name for a given model/dataset combination."""
    models_dict = {'vit_b_16': 'VIT_b_16',
                   'dinov2_vits14': 'DINO V2 VIT14_s',
                   'dinov2_vitl14': 'DINO V2 VIT14_l',
                   'tresnet_m': 'Tresnet M',
                   'vit_l_16': 'VIT_l_16'}
    data_dict = {'military_vehicles': 'Military Vehicles',
                 'imagenet': 'ImageNet',
                 'openimage': 'OpenImage',
                 'coco': 'COCO'}
    main_model_name_str = models_dict[main_model_name]
    data_set_str = data_dict[data_str]

    secondary_model_str = ((' with ' + models_dict[secondary_model_name])
                           if secondary_model_name is not None else '')
    binary_str = ' with Binary' if binary else ''

    return f"{main_model_name_str} on {data_set_str}{binary_str}{secondary_model_str}"


def exponential_backoff(func: typing.Callable) -> typing.Callable:
    """Decorator to retry with exponential backoff when rate limited."""

    def wrapper(*args, **kwargs):
        wait = 30  # Start with 30 seconds
        while True:
            try:
                return func(*args, **kwargs)
            except googleapiclient.errors.HttpError as e:
                error_code = e.resp.status
                if error_code == 429:
                    print(f"Rate limit exceeded, waiting {wait} seconds...")
                    time.sleep(wait)
                    wait *= 1.1  # Grow the wait geometrically (x1.1 per retry)
                else:
                    # Any other HTTP error: log it, pause, and retry
                    print(e)
                    time.sleep(60)

    return wrapper


@exponential_backoff
def update_sheet(range_: str,
                 body: typing.Dict[str, typing.List[typing.List[typing.Union[float, str]]]]):
    """Update a Google Sheet range and handle retries on rate limits."""
    result = __sheet.values().update(
        spreadsheetId=spreadsheet_id,
        range=range_,
        valueInputOption='USER_ENTERED',
        body=body).execute()

    print(f"{result.get('updatedCells')} cells updated in {range_}")


@exponential_backoff
def find_empty_rows_in_column(sheet_tab_name: str,
                              column_letter: str):
    # Fetch the column data
    values = __sheet.values().get(
        spreadsheetId=spreadsheet_id,
        range=f'{sheet_tab_name}!{column_letter}:{column_letter}').execute().get('values', [])

    total_value_num = len(values)

    # Identify empty rows
    empty_row_indices = []
    for index, value in enumerate(values, start=1):  # Starts counting from 1 (Google Sheets row numbers)
        if not value:  # If the list is empty, the row is empty
            empty_row_indices.append(index)

    return empty_row_indices, total_value_num


@exponential_backoff
def get_values_from_columns(sheet_tab_name: str,
                            column_letters: typing.List[str]):
    # Fetch each requested column (skipping the header row) in one batch call
    ranges = [f'{sheet_tab_name}!{letter}2:{letter}' for letter in column_letters]
    response = __sheet.values().batchGet(
        spreadsheetId=spreadsheet_id,
        ranges=ranges
    ).execute()

    # Convert each column to a float array: strip '%' signs, map the literal
    # string 'None' to 0, and skip '#N/A' cells as well as fully empty rows
    # (the API returns those as empty lists, which would otherwise raise IndexError)
    return [np.array([e[0].strip('%') if e[0] != 'None' else 0
                      for e in response_i.get('values', []) if e and e[0] != '#N/A'],
                     dtype=float) for response_i in response['valueRanges']]


@exponential_backoff
def get_maximal_epsilon(sheet_tab_name: str):
    # Specify the separate ranges to fetch
    data_range_b_to_e = f'{sheet_tab_name}!B2:E'
    data_range_g = f'{sheet_tab_name}!G2:G'
    column_a_range = f'{sheet_tab_name}!A2:A'

    # Fetch the data using batchGet
    response = __sheet.values().batchGet(
        spreadsheetId=spreadsheet_id,
        ranges=[data_range_b_to_e, data_range_g, column_a_range]
    ).execute()

    # Extract the values for each range
    data_values_b_to_e = response['valueRanges'][0].get('values', [])
    data_values_g = response['valueRanges'][1].get('values', [])
    column_a_values = response['valueRanges'][2].get('values', [])

    # Standardize the length of each row
    max_length_b_to_e = max((len(row) for row in data_values_b_to_e), default=0)
    data_values_b_to_e = [row + [None] * (max_length_b_to_e - len(row)) for row in data_values_b_to_e]

    max_length_g = max((len(row) for row in data_values_g), default=0)
    data_values_g = [row + [None] * (max_length_g - len(row)) for row in data_values_g]

    # Convert data to NumPy arrays, handling percentages and missing values
    data_array_b_to_e = np.array(
        [[float(item.strip('%')) if isinstance(item, str) and item else 0 for item in row]
         for row in data_values_b_to_e])
    data_array_g = np.array([[float(row[0]) if row and row[0] else 0] for row in data_values_g])

    # Concatenate columns B-E with column G
    data_array = np.hstack((data_array_b_to_e, data_array_g))

    # Calculate the sum of each row along axis 1 (rows)
    row_sums = np.sum(data_array, axis=1)

    # Find the index of the row with the maximum sum
    max_index = np.argmax(row_sums)

    # Retrieve the value from column A for the row with the maximum sum
    if max_index < len(column_a_values):
        return column_a_values[max_index][0]
    else:
        return None
```
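To see how these pieces fit together, here is a minimal usage sketch. It assumes the credential files the module expects (`credentials.json`, `token.json`, `spreadsheet_id.txt`) are already in place; the model names and ranges are just examples, and the `{'values': [[...]]}` body is the standard Sheets v4 `values.update` payload.

```python
# Minimal usage sketch (assumes credentials.json, token.json and
# spreadsheet_id.txt exist where google_sheets_api.py expects them).
from src.PyEDCR.utils import google_sheets_api

# Resolve the tab that stores results for a given main/secondary model pair.
tab = google_sheets_api.get_sheet_tab_name(main_model_name='dinov2_vits14',
                                           data_str='imagenet',
                                           secondary_model_name='dinov2_vitl14',
                                           binary=True)
# -> 'DINO V2 VIT14_s on ImageNet with Binary with DINO V2 VIT14_l'

# Write one example row of values; rate-limit retries are handled
# transparently by the exponential_backoff decorator.
google_sheets_api.update_sheet(range_=f'{tab}!A2:C2',
                               body={'values': [[0.1, '89.0%', '77.81%']]})

# Find gaps in column A, then read columns B and C back as float arrays.
empty_rows, total = google_sheets_api.find_empty_rows_in_column(tab, 'A')
col_b, col_c = google_sheets_api.get_values_from_columns(tab, ['B', 'C'])
```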

src/PyEDCR/utils/paths.py (0 additions, 2 deletions)
@@ -1,7 +1,5 @@
import pathlib

- from google.auth.environment_vars import CREDENTIALS
-
ROOT_PATH = pathlib.Path(__file__).parent.parent.parent.parent
DATA_FOLDER = rf'{ROOT_PATH}/data'
RESULTS_FOLDER = rf'{ROOT_PATH}/results'
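Note that google_sheets_api.py (above) reads `paths.CREDENTIALS_FOLDER`, which is not visible in the lines this hunk displays; presumably it is defined further down in paths.py, while the dropped `CREDENTIALS` import was simply unused. A hypothetical sketch of such a definition, assuming it follows the same pattern as the constants shown:

```python
# Hypothetical: google_sheets_api.py reads paths.CREDENTIALS_FOLDER, so
# paths.py presumably defines it below the displayed hunk, e.g.:
CREDENTIALS_FOLDER = rf'{ROOT_PATH}/credentials'
```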
