MIDI-160: refactoring configuration #6

Open · wants to merge 47 commits into base: master

47 commits
6da26c9
remove model_args, add block_size to hydra configs, make vocab_size a…
WojciechMat Jan 28, 2025
f039d8b
rename sequence_length and block_size to , do not concatenate dicts i…
WojciechMat Jan 29, 2025
f713204
Minor cleanup
roszcz Jan 30, 2025
690502f
Move LR scheduler definition out of the training flow
roszcz Jan 30, 2025
a33c9bb
rename tokenizer in checkpoint to tokenizer_desc, remove tokenizer fi…
WojciechMat Jan 30, 2025
eb590f3
make pretraining and finetuning debugging configs, do not concatenate…
WojciechMat Jan 31, 2025
1545cd8
eval should work for pretraining too
WojciechMat Jan 31, 2025
6bc7b39
eval should work for pretraining too
WojciechMat Jan 31, 2025
7b4b980
bump requirements versions and make them explicit, remove .pt suffix …
WojciechMat Feb 1, 2025
b55fb80
make less validation dataloader processes - the same number as traini…
WojciechMat Feb 1, 2025
ec87e5a
use run stage for evaluation, add tokenized_every dataset config for …
WojciechMat Feb 4, 2025
fe5283f
Change `ParametricTaskManager` to `PianoTaskManager`
SamuelJanas Feb 5, 2025
df3bf32
change roszcz/ -> epr-labs/ for all dataset occurrences
SamuelJanas Feb 5, 2025
67ac227
change naming: sustain-v2 -> augmented
SamuelJanas Feb 5, 2025
b6839f7
Decrease onboarding friction
roszcz Feb 5, 2025
67ae0b6
Training loop and configuration docs, TODO comment regarding gradient…
WojciechMat Feb 6, 2025
86ebb63
external evaluation documentation
WojciechMat Feb 6, 2025
2017129
fix eval samplers, control splits used for evaluation
roszcz Feb 7, 2025
969a163
fix eval script samplers container
roszcz Feb 7, 2025
8861e2a
fix wandb for piano metrics
roszcz Feb 7, 2025
d6c7ae6
cleanup eval config
roszcz Feb 7, 2025
ff5e733
start organizing bigger datasets
roszcz Feb 8, 2025
8a94b24
Switch to private hf repos
roszcz Feb 8, 2025
517c0f1
Finetuning logging config is named subsequence - not finetuning
WojciechMat Feb 8, 2025
ec68c94
Simplify the pretraining config
roszcz Feb 8, 2025
5d3518b
unlock chorales dataset
roszcz Feb 8, 2025
bb5859e
Simplify the record lengths management for next token dataset
roszcz Feb 8, 2025
53d2b5b
add well named configs
roszcz Feb 8, 2025
7931f9d
Add PIAST dataset
roszcz Feb 8, 2025
220c52e
improve logs
roszcz Feb 9, 2025
a6d5247
Optimize PIANO dataset record lengths management
roszcz Feb 9, 2025
240bcfc
clean run name, clean lr config
roszcz Feb 9, 2025
c78fd99
Update the augmentation script
roszcz Feb 9, 2025
a8a9709
Augmentation helpers
roszcz Feb 9, 2025
9150583
Increase number of shards to speedup dataset build
roszcz Feb 9, 2025
276cfe7
load dataset speedup :rocket:
roszcz Feb 9, 2025
a26d95e
Use all augmented datasets
roszcz Feb 9, 2025
4e733d9
It was in fact not that ugly
WojciechMat Feb 10, 2025
d28780d
load piano tasks config with hydra
WojciechMat Feb 10, 2025
d3a8e00
load piano tasks in eval script, from training
WojciechMat Feb 10, 2025
613287b
Update upload script
roszcz Feb 10, 2025
2769ff6
make piano tasks config be a list
WojciechMat Feb 11, 2025
3d4d5bb
Remove indirectness from configs
roszcz Feb 11, 2025
988057b
Support multiple LR scheduler types
roszcz Feb 12, 2025
c7bf95d
refactor the device setup phase
roszcz Feb 14, 2025
c9e9e4b
Refactor management of tokens related to music knowledge
roszcz Feb 14, 2025
195ab5e
Artifacts cleanup in PianoDataset
roszcz Feb 14, 2025
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
rev: 23.7.0
hooks:
- id: black
args: [--line-length=130]
args: [--line-length=120]
additional_dependencies: ['click==8.0.4']
- repo: https://github.com/pycqa/isort
rev: 5.12.0
33 changes: 25 additions & 8 deletions README.md
@@ -1,5 +1,23 @@
# Piano-GPT: MIDI Piano Music Generation

## Quickstart

Train a 10M model:

```sh
# This will create checkpoints in ./tmp/checkpoints and logs in wandb
python -m gpt2.train dataset=small model=gpt2_10M

# No wandb, small memory footprint
python -m gpt2.train dataset=small model=gpt2_10M data.batch_size=2 logging.wandb_log=false
```

Calculate PIANO metrics:

```sh
python -m gpt2.high_level_piano_eval init_from=<checkpoint path>
```

## Overview

Piano-GPT is a project leveraging the GPT-2 architecture for generating and processing MIDI piano music. It introduces the PIANO (Performance Inference And Note Orchestration) dataset, a multi-task benchmark for voice and dynamic reconstruction in MIDI piano rolls.
@@ -28,8 +46,7 @@ The PIANO dataset is designed to standardize approaches and provide a benchmark

## Project Structure

- `artifacts.py`: Utility functions and constants
- `checkpoints/`: Saved model checkpoints
- `tmp/checkpoints/`: Saved model checkpoints
- `dashboards/`: Streamlit dashboards for data visualization
- `data/`: Dataset handling and preprocessing modules
- `database/`: Database connection and management utilities
@@ -66,8 +83,8 @@ gpt2/train.py --config-name=gpt2_pretraining \
data.batch_size=32 \
optimizer.gradient_accumulation_steps=8 \
optimizer.max_iters=30000 \
data.sequence_length=4096 \
dataset.extra_datasets="['roszcz/maestro-sustain-v2', 'roszcz/giant-midi-sustain-v2', 'roszcz/pianofor-ai-sustain-v2']" \
data.context_size=4096 \
dataset.extra_datasets="['epr-labs/maestro-sustain-v2', 'epr-labs/giant-midi-sustain-v2', 'epr-labs/pianofor-ai-sustain-v2']" \
dataset.augmentation.max_pitch_shift=5 \
"dataset.augmentation.speed_change_factors=[0.975, 0.95, 1.025, 1.05]" \
lr.warmup_iters=1000 \
@@ -88,9 +105,9 @@ tasks = subsequence \
data.batch_size=64 \
optimizer.gradient_accumulation_steps=4 \
optimizer.max_iters=30000 \
data.sequence_length=1024 \
data.context_size=1024 \
data.notes_per_record=128 \
dataset.extra_datasets="['roszcz/maestro-sustain-v2', 'roszcz/giant-midi-sustain-v2', 'roszcz/pianofor-ai-sustain-v2']" \
dataset.extra_datasets="['epr-labs/maestro-sustain-v2', 'epr-labs/giant-midi-sustain-v2', 'epr-labs/pianofor-ai-sustain-v2']" \
dataset.augmentation.max_pitch_shift=5 \
dataset.augmentation.speed_change_factors="[0.95, 1.05]" \
lr.learning_rate=8e-5 \
@@ -115,9 +132,9 @@ system.data_workers=124 \
optimizer.gradient_accumulation_steps=4 \
task=next_token_prediction_with_composer \
eval_iters=200 eval_interval=1000 \
"dataset.extra_datasets=['roszcz/maestro-sustain-v2', 'roszcz/giant-midi-sustain-v2', 'roszcz/pianofor-ai-sustain-v2']" \
"dataset.extra_datasets=['epr-labs/maestro-sustain-v2', 'epr-labs/giant-midi-sustain-v2', 'epr-labs/pianofor-ai-sustain-v2']" \
data.batch_size=20 \
data.sequence_length=4096 \
data.context_size=4096 \
logging.wandb_run_name_suffix=huge-pretraining-4096-ctx \
tokenizer=awesome \
logging.wandb_project=piano-awesome-gpt
14 changes: 7 additions & 7 deletions dashboards/piano_dataset_review.py
@@ -8,7 +8,7 @@
import matplotlib.pyplot as plt
from datasets import Dataset, load_dataset
from midi_tokenizers import ExponentialTimeTokenizer
from piano_dataset.piano_tasks import ParametricTaskManager
from piano_dataset.piano_tasks import PianoTaskManager

from data.piano_dataset import PianoDataset
from artifacts import dataset_tokens, composer_tokens
@@ -58,7 +58,7 @@ def load_piano_dataset(
config: dict,
dataset_name: str,
dataset_split: str,
sequence_length: int,
context_size: int,
notes_per_record: int,
loss_masking: str,
selected_composers: list[str],
@@ -78,13 +78,13 @@ def filter_dataset(record):
return composer_match and title_match

filtered_dataset = dataset.filter(filter_dataset)
parametric_task_manager = ParametricTaskManager.load_default()
parametric_task_manager = PianoTaskManager.load_default()

tokenizer = ExponentialTimeTokenizer(**tokenizer_parameters)
piano_dataset = PianoDataset(
dataset=filtered_dataset,
tokenizer=tokenizer,
sequence_length=sequence_length,
context_size=context_size,
notes_per_record=notes_per_record,
piano_task_manager=parametric_task_manager,
loss_masking=loss_masking,
@@ -110,7 +110,7 @@ def main():
value=256,
)
with col2:
sequence_length = st.number_input(
context_size = st.number_input(
label="Sequence Length",
min_value=1,
value=2048,
@@ -148,7 +148,7 @@

st.form_submit_button(label="Update Tokenizer")

parametric_task_manager = ParametricTaskManager.load_default()
parametric_task_manager = PianoTaskManager.load_default()

config = {
"base_dataset_name": base_dataset_name,
@@ -186,7 +186,7 @@ def main():
config=config,
dataset_name=dataset_name,
dataset_split=dataset_split,
sequence_length=sequence_length,
context_size=context_size,
notes_per_record=notes_per_record,
loss_masking=loss_masking,
selected_composers=selected_composers,
3 changes: 0 additions & 3 deletions data/dataset.py
@@ -1,4 +1,3 @@
from typing import Literal
from abc import abstractmethod

from datasets import Dataset as HuggingFaceDataset
@@ -19,7 +18,6 @@ def __init__(
self,
dataset: HuggingFaceDataset,
tokenizer: ExponentialTimeTokenizer | AwesomeMidiTokenizer,
loss_masking: Literal["finetuning", "pretraining"] = "pretraining",
):
"""
Initialize the MidiDataset.
@@ -32,7 +30,6 @@ def __init__(

# MidiTokenizer which was used during creation of the dataset
self.tokenizer = tokenizer
self.loss_masking = loss_masking

# Dataset with tokenized MIDI data
self.dataset = dataset
93 changes: 93 additions & 0 deletions data/musicality.py
@@ -0,0 +1,93 @@
import re


class MusicManager:
dataset_tokens = [
"<MAESTRO>",
"<PIJAMA>",
"<VGMIDI>",
"<MUSIC-NET>",
"<PIANO-MIDI-DE>",
"<LAKH-LMD-FULL>",
"<GIANT-MIDI>",
"<IMSLP>",
"<ATEPP-1.1>",
"<PIANO_FOR_AI>",
]
composer_tokens = [
"<SCRIABIN>",
"<FRANCK>",
"<MOZART>",
"<CHOPIN>",
"<MENDELSSON>",
"<LISZT>",
"<SCHUBERT>",
"<BRAHMS>",
"<HAYDN>",
"<BEETHOVEN>",
"<BALAKIREV>",
"<SCHUMANN>",
"<RACHMANIOFF>",
"<UNKNOWN_COMPOSER>",
"<BACH>",
]

composer_token_map: dict[str, str] = {
"Alexander Scriabin": "<SCRIABIN>",
"César Franck": "<FRANCK>",
"Wolfgang Amadeus Mozart": "<MOZART>",
"Frédéric Chopin": "<CHOPIN>",
"Felix Mendelssohn": "<MENDELSSON>",
"Franz Liszt": "<LISZT>",
"Franz Schubert": "<SCHUBERT>",
"Johannes Brahms": "<BRAHMS>",
"Joseph Haydn": "<HAYDN>",
"Ludwig van Beethoven": "<BEETHOVEN>",
"Mily Balakirev": "<BALAKIREV>",
"Robert Schumann": "<SCHUMANN>",
"Sergei Rachmaninoff": "<RACHMANIOFF>",
"Johann Sebastian Bach": "<BACH>",
}

def __init__(self):
self.composer_regex_map = self.create_composer_regex_map()

@property
def tokens(self) -> list[str]:
return self.dataset_tokens + self.composer_tokens

def create_composer_regex_map(self) -> dict[re.Pattern, str]:
regex_map: dict[re.Pattern, str] = {}
for full_name, token in self.composer_token_map.items():
names = full_name.split()
surname = names[-1]
pattern = re.compile(rf"\b{re.escape(surname)}\b", re.IGNORECASE)
regex_map[pattern] = token
return regex_map

def get_dataset_token(self, piece_source: dict) -> str:
dataset_name = piece_source.get("dataset")

for dataset_token in self.dataset_tokens:
dataset_token_name = dataset_token[1:-1]
if dataset_token_name.lower() == dataset_name.lower():
return dataset_token

# FIXME Our internal dataset is the only one without the name
# stored as part of the source. This should change with the next
# dataset version, then we can add <UNKNOWN_DATASET> here
return "<PIANO_FOR_AI>"

def get_composer_token(self, composer: str) -> str:
# TODO This should be more refined - we know that composer
# information is stored in many ways across different datasets
# and we should use that knowledge:
# def get_composer_token(dataset_name: str, piece_source: dict): ...
matches: list[tuple[re.Match, str]] = [
(match, token) for pattern, token in self.composer_regex_map.items() if (match := pattern.search(composer))
]

if len(matches) == 1:
return matches[0][1]

return "<UNKNOWN_COMPOSER>"
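The added `MusicManager` maps free-form composer strings to tokens by matching each composer's surname as a case-insensitive, word-bounded regex, and falls back to `<UNKNOWN_COMPOSER>` unless exactly one surname matches. A minimal standalone sketch of that matching logic (reimplemented here for illustration with a trimmed token map, not imported from the PR):

```python
import re

# Trimmed composer map for illustration; the PR's MusicManager carries the full set
composer_token_map = {
    "Frédéric Chopin": "<CHOPIN>",
    "Johann Sebastian Bach": "<BACH>",
    "Franz Liszt": "<LISZT>",
}

# One case-insensitive, word-boundary pattern per surname,
# mirroring create_composer_regex_map in the diff above
regex_map = {
    re.compile(rf"\b{re.escape(name.split()[-1])}\b", re.IGNORECASE): token
    for name, token in composer_token_map.items()
}

def get_composer_token(composer: str) -> str:
    # Emit a token only when exactly one surname matches; ambiguous or
    # unrecognized strings fall back to the unknown-composer token
    matches = [token for pattern, token in regex_map.items() if pattern.search(composer)]
    if len(matches) == 1:
        return matches[0]
    return "<UNKNOWN_COMPOSER>"

print(get_composer_token("F. Chopin"))         # <CHOPIN>
print(get_composer_token("chopin, frédéric"))  # <CHOPIN>
print(get_composer_token("Various artists"))   # <UNKNOWN_COMPOSER>
```

The word-boundary anchor avoids substring false positives, and the exactly-one-match rule keeps strings naming two known composers (e.g. a duet arrangement) from being tagged arbitrarily.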