
Commit f36f764

Merge pull request #128 from hassonlab/dev
Dev to Main 20221222
2 parents b8253c6 + 4588392

14 files changed (+383 -234 lines)

Makefile

Lines changed: 7 additions & 9 deletions
@@ -69,15 +69,15 @@ create-sig-pickle:
 	mkdir -p logs
 	$(CMD) scripts/tfspkl_main.py \
 		--project-id $(PRJCT_ID) \
-		--sig-elec-file data/$(PRJCT_ID)/all-electrodes.csv
+		--sig-elec-file all-electrodes2.csv
 
 # upload pickles to google cloud bucket
 # on bucket we use 247 not tfs, so manually adjust as needed
 # upload-pickle: pid=247
 upload-pickle: pid=podcast
 upload-pickle:
 	for sid in $(SID_LIST); do \
-		gsutil -m rsync results/$(PRJCT_ID)/$$sid/pickles/ gs://247-podcast-data/$(pid)-pickles/$$sid; \
+		gsutil -m rsync -rd results/$(PRJCT_ID)/$$sid/pickles/ gs://247-podcast-data/$(pid)-pickles/$$sid; \
 	done
 
 # upload raw data to google cloud bucket
@@ -107,7 +107,7 @@ download-247-pickles:
 	"facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b", \
 	"facebook/opt-2.7b", "facebook/opt-6.7b", "facebook/opt-30b", \
 	"facebook/blenderbot_small-90M"}
-%-embeddings: CNXT_LEN := 1024 512 256 128 64 32 16 8 4 2 1
+%-embeddings: CNXT_LEN := 1024
 %-embeddings: LAYER := all
 # {'all' for all layers | 'last' for the last layer | (list of) integer(s) >= 1}
 # Note: embeddings file is the same for all podcast subjects \
@@ -154,13 +154,11 @@ concatenate-embeddings:
 	done;
 
 # Podcast: copy embeddings to other subjects as well
-# for sid in 662 717 723 741 742 763 798 777; do
 copy-embeddings:
-	@for fn in results/podcast/661/pickles/*embeddings.pkl; do \
-		for sid in 777; do \
-			cp -pf $$fn $$(echo $$fn | sed "s/661/$$sid/g"); \
-		done; \
-	done
+	fn=results/podcast/661/pickles/embeddings
+	for sid in 662 717 723 741 742 763 798 777; do \
+		cp -rpf $$fn $$(echo $$fn | sed "s/661/$$sid/g"); \
+	done; \
 
 
 # Download huggingface models to cache (before generating embeddings)
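A note on the rewritten copy-embeddings recipe: make normally runs each recipe line in its own shell, so unless .ONESHELL is set elsewhere in the Makefile, the fn= assignment on its own line is not visible to the for loop below it; the two lines would need backslash continuations to share a shell. As a sketch of the intended effect (paths and subject list taken from the recipe above, nothing else assumed):

# Sketch of what copy-embeddings intends: mirror subject 661's embeddings
# directory to the other podcast subjects (a cp -rpf equivalent).
import shutil
from pathlib import Path

src = Path("results/podcast/661/pickles/embeddings")
for sid in (662, 717, 723, 741, 742, 763, 798, 777):
    dst = Path(str(src).replace("661", str(sid)))
    shutil.copytree(src, dst, dirs_exist_ok=True)  # copy2 preserves metadata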

requirements.yml

Lines changed: 98 additions & 26 deletions
@@ -2,57 +2,83 @@ name: 247-main
 channels:
   - defaults
 dependencies:
-  - _libgcc_mutex=0.1
-  - _openmp_mutex=5.1
-  - bzip2=1.0.8
-  - ca-certificates=2022.4.26
-  - ld_impl_linux-64=2.38
-  - libffi=3.3
-  - libgcc-ng=11.2.0
-  - libgomp=11.2.0
-  - libstdcxx-ng=11.2.0
-  - libuuid=1.0.3
-  - ncurses=6.3
-  - openssl=1.1.1p
-  - python=3.10.4
-  - readline=8.1.2
-  - sqlite=3.38.5
-  - tk=8.6.12
-  - tzdata=2022a
-  - wheel=0.37.1
-  - xz=5.2.5
-  - zlib=1.2.12
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h7b6447c_0
+  - ca-certificates=2022.4.26=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.3=he6710b0_2
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.0.3=h7f8727e_2
+  - ncurses=6.3=h5eee18b_3
+  - openssl=1.1.1p=h5eee18b_0
+  - python=3.10.4=h12debd9_0
+  - readline=8.1.2=h7f8727e_1
+  - sqlite=3.38.5=hc218d9a_0
+  - tk=8.6.12=h1ccaba5_0
+  - tzdata=2022a=hda174b7_0
+  - wheel=0.37.1=pyhd3eb1b0_0
+  - xz=5.2.5=h7f8727e_1
+  - zlib=1.2.12=h7f8727e_2
   - pip:
+    - accelerate==0.14.0
+    - aiohttp==3.8.3
+    - aiosignal==1.2.0
+    - alabaster==0.7.12
     - anyio==3.6.1
+    - appdirs==1.4.4
     - argon2-cffi==21.3.0
     - argon2-cffi-bindings==21.2.0
     - asttokens==2.0.5
+    - async-timeout==4.0.2
     - attrs==21.4.0
+    - audioread==3.0.0
     - babel==2.10.3
     - backcall==0.2.0
     - beautifulsoup4==4.11.1
-    - black==22.6.0
+    - black==22.12.0
     - bleach==5.0.1
+    - boltons==21.0.0
+    - bracex==2.3.post1
     - certifi==2022.6.15
     - cffi==1.15.1
     - charset-normalizer==2.1.0
     - click==8.1.3
+    - click-extra==3.5.0
+    - click-log==0.4.0
+    - cloup==2.0.0.post1
+    - colorama==0.4.5
+    - commentjson==0.9.0
     - cycler==0.11.0
+    - datasets==2.5.2
     - debugpy==1.6.2
     - decorator==5.1.1
     - defusedxml==0.7.1
+    - dill==0.3.5.1
+    - docutils==0.19
     - entrypoints==0.4
     - executing==0.8.3
     - fastjsonschema==2.15.3
+    - ffmpeg-python==0.2.0
     - filelock==3.7.1
     - fonttools==4.34.4
+    - frozenlist==1.3.1
+    - fsspec==2022.8.2
+    - future==0.18.2
     - gensim==4.2.0
     - h5py==3.7.0
-    - huggingface-hub==0.8.1
+    - htmlmin==0.1.12
+    - huggingface-hub==0.10.0
+    - icecream==2.1.3
     - idna==3.3
+    - imagehash==4.2.1
+    - imagesize==1.4.1
     - ipykernel==6.15.1
     - ipython==8.4.0
     - ipython-genutils==0.2.0
+    - ipywidgets==7.7.1
     - isort==5.10.1
     - jedi==0.18.1
     - jinja2==3.1.2
@@ -65,73 +91,119 @@ dependencies:
     - jupyterlab==3.4.3
     - jupyterlab-pygments==0.2.2
     - jupyterlab-server==2.15.0
+    - jupyterlab-widgets==1.1.1
     - kiwisolver==1.4.3
+    - lark-parser==0.7.8
+    - librosa==0.9.2
     - llvmlite==0.38.1
     - markupsafe==2.1.1
     - mat73==0.59
     - matplotlib==3.5.2
     - matplotlib-inline==0.1.3
+    - mergedeep==1.3.4
+    - missingno==0.5.1
     - mistune==0.8.4
+    - more-itertools==8.14.0
+    - multidict==6.0.2
+    - multimethod==1.8
+    - multiprocess==0.70.13
     - mypy-extensions==0.4.3
     - nbclassic==0.4.2
     - nbclient==0.6.6
     - nbconvert==6.5.0
     - nbformat==5.4.0
     - nest-asyncio==1.5.5
+    - networkx==2.8.5
     - nltk==3.7
+    - notebook==6.4.12
     - notebook-shim==0.1.0
     - numba==0.55.2
     - numexpr==2.8.3
     - numpy==1.22.4
+    - nvidia-cublas-cu11==11.10.3.66
+    - nvidia-cuda-nvrtc-cu11==11.7.99
+    - nvidia-cuda-runtime-cu11==11.7.99
+    - nvidia-cudnn-cu11==8.5.0.96
+    - packageurl-python==0.10.4
     - packaging==21.3
+    - pallets-sphinx-themes==2.0.2
     - pandas==1.4.3
+    - pandas-profiling==3.2.0
     - pandocfilters==1.5.0
     - parso==0.8.3
     - pathspec==0.9.0
     - pexpect==4.8.0
+    - phik==0.12.2
     - pickleshare==0.7.5
     - pillow==9.2.0
     - pip==21.2.4
     - platformdirs==2.5.2
+    - pooch==1.6.0
     - prometheus-client==0.14.1
     - prompt-toolkit==3.0.30
     - psutil==5.9.1
     - ptyprocess==0.7.0
     - pure-eval==0.2.2
+    - pyarrow==9.0.0
     - pycparser==2.21
+    - pydantic==1.9.2
     - pygments==2.12.0
+    - pygments-ansi-color==0.1.0
     - pyparsing==3.0.9
     - pyrsistent==0.18.1
     - python-dateutil==2.8.2
     - pytz==2022.1
+    - pywavelets==1.3.0
     - pyyaml==6.0
     - pyzmq==23.2.0
     - regex==2022.7.9
     - requests==2.28.1
+    - resampy==0.4.2
+    - responses==0.18.0
     - scikit-learn==1.1.1
     - scipy==1.8.1
+    - seaborn==0.11.2
     - send2trash==1.8.0
     - setuptools==61.2.0
     - six==1.16.0
     - smart-open==6.0.0
     - sniffio==1.2.0
+    - snowballstemmer==2.2.0
+    - soundfile==0.11.0
     - soupsieve==2.3.2.post1
+    - sphinx==5.3.0
+    - sphinxcontrib-applehelp==1.0.2
+    - sphinxcontrib-devhelp==1.0.2
+    - sphinxcontrib-htmlhelp==2.0.0
+    - sphinxcontrib-jsmath==1.0.1
+    - sphinxcontrib-qthelp==1.0.3
+    - sphinxcontrib-serializinghtml==1.1.5
     - stack-data==0.3.0
+    - tabulate==0.9.0
+    - tangled-up-in-unicode==0.2.0
     - terminado==0.15.0
     - threadpoolctl==3.1.0
     - tinycss2==1.1.1
     - tokenizers==0.12.1
     - tomli==2.0.1
-    - torch==1.12.0+cu113
-    - torchaudio==0.12.0+cu113
-    - torchvision==0.13.0+cu113
+    - tomli-w==1.0.0
+    - torch==1.13.1
+    - torchaudio==0.13.1
+    - torchvision==0.14.1
     - tornado==6.2
     - tqdm==4.64.0
     - traitlets==5.3.0
-    - transformers==4.20.1
+    - transformers==4.25.1
     - typing-extensions==4.3.0
     - urllib3==1.26.10
+    - visions==0.7.4
+    - wcmatch==8.4.1
     - wcwidth==0.2.5
     - webencodings==0.5.1
     - websocket-client==1.3.3
+    - whisper==1.0
+    - widgetsnbextension==3.6.1
+    - xmltodict==0.13.0
+    - xxhash==3.0.0
+    - yarl==1.8.1
 prefix: /home/hgazula/.conda/envs/247-main
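The environment now pins full conda build strings and replaces the CUDA-suffixed torch 1.12.0+cu113 wheels with the generic 1.13.1 builds; the nvidia-*-cu11 pip packages above are the CUDA runtime those generic wheels pull in. A quick post-install sanity check (a sketch, not part of the diff):

# Confirm the upgraded torch resolves its bundled CUDA runtime.
import torch

print(torch.__version__)          # expect 1.13.1
print(torch.cuda.is_available())  # True on a machine with an NVIDIA GPU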

scripts/electrode_utils.py

Lines changed: 10 additions & 15 deletions
@@ -6,6 +6,7 @@
 
 import numpy as np
 from scipy.io import loadmat
+from tfspkl_config import ELECTRODE_FOLDER_MAP
 
 
 def get_electrode(CONFIG, elec_id):
@@ -19,21 +20,15 @@ def get_electrode(CONFIG, elec_id):
     """
     conversation, electrode = elec_id
 
-    if CONFIG["project_id"] == "podcast":
-        search_str = conversation + f"/preprocessed_all/*_{electrode}.mat"
-    elif CONFIG["project_id"] == "tfs":
-        if CONFIG["subject"] == "7170":
-            search_str = conversation + f"/preprocessed_v2/*_{electrode}.mat"
-            # TODO: check if it is preprocessed or preprocessed_v2
-        elif CONFIG["subject"] == "798":
-            search_str = (
-                conversation + f"/preprocessed_allElec/*_{electrode}.mat"
-            )
-        else:
-            search_str = conversation + f"/preprocessed/*_{electrode}.mat"
-    else:
-        print("Incorrect Project ID")
-        sys.exit()
+    electrode_folder = ELECTRODE_FOLDER_MAP.get(CONFIG["project_id"], None).get(
+        CONFIG["subject"], None
+    )
+
+    if not electrode_folder:
+        print("Incorrect Project ID or Subject")
+        exit()
+
+    search_str = conversation + f"/{electrode_folder}/*_{electrode}.mat"
 
     mat_fn = glob.glob(search_str)
     if mat_fn:
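The hard-coded per-project, per-subject branches are replaced with a table lookup; the table itself lives in tfspkl_config.py, which is not part of this diff. A hypothetical sketch of its shape, using the folder names from the removed branches, plus a slightly safer variant of the lookup: passing {} rather than None as the first default avoids an AttributeError on the chained .get() when the project id itself is unknown.

# Hypothetical shape of the map imported from tfspkl_config; the real
# folder names and subject ids live in tfspkl_config.py, not shown here.
ELECTRODE_FOLDER_MAP = {
    "podcast": {"661": "preprocessed_all"},
    "tfs": {
        "7170": "preprocessed_v2",
        "798": "preprocessed_allElec",
        # other tfs subjects presumably map to plain "preprocessed"
    },
}


def lookup_electrode_folder(project_id, subject):
    # {} as the default keeps an unknown project id from raising
    # AttributeError before the "Incorrect Project ID" path is reached.
    return ELECTRODE_FOLDER_MAP.get(project_id, {}).get(subject)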

scripts/tfsemb_LMBase.py

Lines changed: 36 additions & 11 deletions
@@ -7,6 +7,27 @@
 from utils import save_pickle as svpkl
 
 
+def clean_lm_model_name(item):
+    """Remove unnecessary parts from the language model name.
+
+    Args:
+        item (str/list): full model name from HF Hub
+
+    Returns:
+        (str/list): pretty model name
+
+    Example:
+        clean_lm_model_name("EleutherAI/gpt-neo-1.3B") == "gpt-neo-1.3B"
+    """
+    if isinstance(item, str):
+        return item.split("/")[-1]
+
+    if isinstance(item, list):
+        return [clean_lm_model_name(i) for i in item]
+
+    print("Invalid input. Please check.")
+
+
 def add_vocab_columns(args, df, column=None):
     """Add columns to the dataframe indicating whether each word is in the
     vocabulary of the language models we're using.
@@ -27,17 +48,20 @@ def add_vocab_columns(args, df, column=None):
             model, local_files_only=False
         )
 
-        key = model.split("/")[-1]
+        key = clean_lm_model_name(model)
         print(f"Adding column: (token) in_{key}")
 
         try:
             curr_vocab = tokenizer.vocab
         except AttributeError:
             curr_vocab = tokenizer.get_vocab()
 
-        df[f"in_{key}"] = df[column].apply(
-            lambda x: isinstance(curr_vocab.get(x), int)
-        )
+        def helper(x):
+            if len(tokenizer.tokenize(x)) == 1:
+                return isinstance(curr_vocab.get(tokenizer.tokenize(x)[0]), int)
+            return False
+
+        df[f"in_{key}"] = df[column].apply(helper)
 
     return df
 
@@ -49,16 +73,17 @@ def main():
 
     base_df = load_pickle(args.labels_pickle, "labels")
 
+    glove = api.load("glove-wiki-gigaword-50")
+    base_df["in_glove50"] = base_df.word.str.lower().apply(
+        lambda x: isinstance(glove.key_to_index.get(x), int)
+    )
+
     if args.embedding_type == "glove50":
-        base_df = add_vocab_columns(args, base_df, column="word")
+        base_df = base_df[base_df["in_glove50"]]
+        base_df = add_vocab_columns(args, base_df, column="word", flag=True)
     else:
-        # Add glove
-        glove = api.load("glove-wiki-gigaword-50")
-        base_df["in_glove"] = base_df.word.str.lower().apply(
-            lambda x: isinstance(glove.key_to_index.get(x), int)
-        )
        base_df = tokenize_and_explode(args, base_df)
-        base_df = add_vocab_columns(args, base_df, column="token")
+        base_df = add_vocab_columns(args, base_df, column="token2word")
 
     svpkl(base_df, args.base_df_file)
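Two things are worth noting here. First, the vocabulary check changed in substance, not just in name: the old version looked the raw string up in the tokenizer vocabulary, while the new helper tokenizes the word first and counts it as in-vocabulary only when it maps to a single token. A standalone sketch of that logic (the gpt2 checkpoint is illustrative, not from this diff):

# Sketch of the new single-token vocabulary check, mirroring helper() above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative model choice
try:
    curr_vocab = tokenizer.vocab
except AttributeError:
    curr_vocab = tokenizer.get_vocab()


def in_vocab(word):
    pieces = tokenizer.tokenize(word)
    # In-vocabulary only if the word survives as exactly one token piece.
    return len(pieces) == 1 and isinstance(curr_vocab.get(pieces[0]), int)


print(in_vocab("hello"))           # a single GPT-2 token, so True
print(in_vocab("neurosemantics"))  # splits into several pieces, so False

Second, main() now calls add_vocab_columns(..., flag=True) while the signature visible in this diff is still def add_vocab_columns(args, df, column=None); unless the flag parameter is added elsewhere in this commit, that call would raise a TypeError.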
