v1.5.1 - fixing --save-all/--save-recycles options

sokrypton · sokrypton · commit b4c1bc7cf89b · 2023-02-06T16:39:27.000-05:00
diff --git a/AlphaFold2.ipynb b/AlphaFold2.ipynb
@@ -46,7 +46,7 @@
       "source": [
         "<img src=\"https://raw.githubusercontent.com/sokrypton/ColabFold/main/.github/ColabFold_Marv_Logo_Small.png\" height=\"200\" align=\"right\" style=\"height:240px\">\n",
         "\n",
-        "##ColabFold v1.5.0: AlphaFold2 using MMseqs2\n",
+        "##ColabFold v1.5.1: AlphaFold2 using MMseqs2\n",
         "\n",
         "Easy to use protein structure and complex prediction using [AlphaFold2](https://www.nature.com/articles/s41586-021-03819-2) and [Alphafold2-multimer](https://www.biorxiv.org/content/10.1101/2021.10.04.463034v1). Sequence alignments/templates are generated through [MMseqs2](mmseqs.com) and [HHsearch](https://github.com/soedinglab/hh-suite). For more details, see <a href=\"#Instructions\">bottom</a> of the notebook, checkout the [ColabFold GitHub](https://github.com/sokrypton/ColabFold) and read our manuscript. \n",
         "Old version: [v1.4](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.4.0/AlphaFold2.ipynb)\n",
@@ -393,6 +393,7 @@
         "import glob\n",
         "import matplotlib.pyplot as plt\n",
         "from colabfold.colabfold import plot_plddt_legend\n",
+        "from colabfold.colabfold import pymol_color_list, alphabet_list\n",
         "rank_num = 1 #@param [\"1\", \"2\", \"3\", \"4\", \"5\"] {type:\"raw\"}\n",
         "color = \"lDDT\" #@param [\"chain\", \"lDDT\", \"rainbow\"]\n",
         "show_sidechains = False #@param {type:\"boolean\"}\n",
@@ -414,9 +415,9 @@
         "    view.setStyle({'cartoon': {'color':'spectrum'}})\n",
         "  elif color == \"chain\":\n",
         "    chains = len(queries[0][1]) + 1 if is_complex else 1\n",
-        "    for n,chain,color in zip(range(chains),list(\"ABCDEFGH\"),\n",
-        "                     [\"lime\",\"cyan\",\"magenta\",\"yellow\",\"salmon\",\"white\",\"blue\",\"orange\"]):\n",
-        "      view.setStyle({'chain':chain},{'cartoon': {'color':color}})\n",
+        "    for n,chain,color in zip(range(chains),alphabet_list,pymol_color_list):\n",
+        "       view.setStyle({'chain':chain},{'cartoon': {'color':color}})\n",
+        "\n",
         "  if show_sidechains:\n",
         "    BB = ['C','O','N']\n",
         "    view.addStyle({'and':[{'resn':[\"GLY\",\"PRO\"],'invert':True},{'atom':BB,'invert':True}]},\n",
diff --git a/README.md b/README.md
@@ -1,8 +1,11 @@
-# ColabFold - v1.5.0
+# ColabFold - v1.5.1
 
 ```diff
-+ 04Feb2023: ColabFold updated to use AlphaFold v2.3.1!
++ 04Feb2023: v1.5.0 - ColabFold updated to use AlphaFold v2.3.1!
++ 06Feb2023: v1.5.1 - fixing --save-all/--save-recycles options
 ```
+For details of what changed in v1.5, see the [change log](https://github.com/sokrypton/ColabFold/wiki/v1.5.0)!
+
 <p align="center"><img src="https://github.com/sokrypton/ColabFold/raw/main/.github/ColabFold_Marv_Logo.png" height="250"/></p>
 
 ### Making Protein folding accessible to all via Google Colab!
diff --git a/batch/AlphaFold2_batch.ipynb b/batch/AlphaFold2_batch.ipynb
@@ -35,7 +35,7 @@
         "id": "G4yBrceuFbf3"
       },
       "source": [
-        "#ColabFold: AlphaFold2 w/ MMseqs2 BATCH\n",
+        "#ColabFold v1.5.1: AlphaFold2 w/ MMseqs2 BATCH\n",
         "\n",
         "<img src=\"https://raw.githubusercontent.com/sokrypton/ColabFold/main/.github/ColabFold_Marv_Logo_Small.png\" height=\"256\" align=\"right\" style=\"height:256px\">\n",
         "\n",
diff --git a/colabfold/alphafold/models.py b/colabfold/alphafold/models.py
@@ -20,7 +20,7 @@ def load_models_and_params(
     rank_by: str = "auto",
     max_seq: Optional[int] = None,
     max_extra_seq: Optional[int] = None,
-    use_cluster_profile: Optional[bool] = None,
+    use_cluster_profile: bool = True,
     use_fuse: bool = True,
     use_bfloat16: bool = True,
     use_dropout: bool = False,
@@ -84,8 +84,7 @@ def load_models_and_params(
             if "multimer" in model_suffix:
                 if num_recycles is not None:
                     model_config.model.num_recycle = num_recycles
-                if use_cluster_profile is not None:
-                    model_config.model.embeddings_and_evoformer.use_cluster_profile = use_cluster_profile
+                model_config.model.embeddings_and_evoformer.use_cluster_profile = use_cluster_profile
                 model_config.model.num_ensemble_eval = num_ensemble
             else:
                 if num_recycles is not None:
diff --git a/colabfold/batch.py b/colabfold/batch.py
@@ -403,6 +403,7 @@ def predict_structure(
             if "multimer" in model_type:
                 # TODO: add multimer padding
                 input_features = processed_feature_dict
+                input_features["asym_id"] = input_features["asym_id"] - input_features["asym_id"][...,0]
             else:
                 # TODO: move asym_id processing to "process_features"
                 r = processed_feature_dict["aatype"].shape[0]
@@ -427,28 +428,24 @@ def callback(prediction_result, recycles):
                     print_line += f" {y}={prediction_result[x]:.3g}"
                 logger.info(f"{tag} recycle={recycles}{print_line}")
 
-                if save_recycles or save_all:
-                    prediction_result = _jnp_to_np(prediction_result)
-                    prediction_result["representations"] = prediction_result.pop("prev")
-                
                 if save_recycles:
-                    final_atom_mask = prediction_result["structure_module"]["final_atom_mask"]
-                    b_factors = prediction_result["plddt"][:, None] * final_atom_mask
+                    result = _jnp_to_np(prediction_result)
+                    final_atom_mask = result["structure_module"]["final_atom_mask"]
+                    b_factors = result["plddt"][:, None] * final_atom_mask
                     unrelaxed_protein = protein.from_prediction(features=input_features,
-                        result=prediction_result, b_factors=b_factors,
+                        result=result, b_factors=b_factors,
                         remove_leading_feature_dimension=("ptm" in model_type))
                     
                     unrelaxed_pdb_lines = protein.to_pdb(class_to_np(unrelaxed_protein))
                     files.get("unrelaxed",f"r{recycles}.pdb").write_text(unrelaxed_pdb_lines)
                 
-                if save_all:
-                    with files.get("all",f"r{recycles}.pickle").open("wb") as handle:
-                        pickle.dump(prediction_result, handle)
+                    if save_all:
+                        with files.get("all",f"r{recycles}.pickle").open("wb") as handle:
+                            pickle.dump(result, handle)
 
             prediction_result, recycles = \
             model_runner.predict(input_features, random_seed=seed, prediction_callback=callback)
             prediction_result = _jnp_to_np(prediction_result)
-            prediction_result["representations"] = prediction_result.pop("prev")
             prediction_times.append(time.time() - start)
 
             ########################
@@ -482,19 +479,23 @@ def callback(prediction_result, recycles):
 
             #########################
             # save results
-            #########################            
+            #########################      
+
             # save pdb
             protein_lines = protein.to_pdb(unrelaxed_protein)
             files.get("unrelaxed","pdb").write_text(protein_lines)
             unrelaxed_pdb_lines.append(protein_lines)
 
             # save raw outputs
-            if save_single_representations or save_pair_representations:
-                rep = prediction_result["representations"]
-                if save_single_representations:
-                    np.save(files.get("single_repr","npy"), rep["prev_msa_first_row"])
-                if save_pair_representations:
-                    np.save(files.get("pair_repr","npy"), rep["prev_pair"])
+            if save_all:
+                with files.get("all","pickle").open("wb") as handle:
+                    pickle.dump(prediction_result, handle)
+            if save_single_representations:
+                np.save(files.get("single_repr","npy"),
+                    prediction_result["prev"]["prev_msa_first_row"])
+            if save_pair_representations:
+                np.save(files.get("pair_repr","npy"),
+                    prediction_result["prev"]["prev_pair"])
 
             # write an easy-to-use format (pAE and pLDDT)
             with files.get("scores","json").open("w") as handle:
@@ -1186,6 +1187,7 @@ def run(
     dpi: int = 200,
     max_seq: Optional[int] = None,
     max_extra_seq: Optional[int] = None,
+    use_cluster_profile: bool = True,
     feature_dict_callback: Callable[[Any], Any] = None,
     **kwargs
 ):
@@ -1234,7 +1236,6 @@ def run(
     pair_mode  = old_names.get(pair_mode,pair_mode)
     feature_dict_callback = kwargs.pop("input_features_callback", feature_dict_callback)
     use_dropout           = kwargs.pop("training", use_dropout)
-    use_cluster_profile   = kwargs.pop("use_cluster_profile", None)
     use_fuse              = kwargs.pop("use_fuse", True)
     use_bfloat16          = kwargs.pop("use_bfloat16", True)
     max_msa               = kwargs.pop("max_msa",None)
@@ -1659,7 +1660,7 @@ def main():
         help="rank models by auto, plddt or ptmscore",
         type=str,
         default="auto",
-        choices=["auto", "plddt", "ptmscore", "multimer"],
+        choices=["auto", "plddt", "ptm", "iptm", "multimer"],
     )
     parser.add_argument(
         "--pair-mode",
@@ -1711,6 +1712,12 @@ def main():
         type=str,
         default=None,
     )
+    parser.add_argument(
+        "--disable-cluster-profile",
+        default=False,
+        action="store_true",
+        help="EXPERIMENTAL: for multimer models, disable cluster profiles",
+    )
     parser.add_argument(
         "--zip",
         default=False,
@@ -1798,6 +1805,7 @@ def main():
         max_seq=args.max_seq,
         max_extra_seq=args.max_extra_seq,
         max_msa=args.max_msa,
+        use_cluster_profile=not args.disable_cluster_profile,
         use_gpu_relax = args.use_gpu_relax,
         save_all=args.save_all,
         save_recycles=args.save_recycles,
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "colabfold"
-version = "1.5.0"
+version = "1.5.1"
 description = "Making protein folding accessible to all. Predict proteins structures both in google colab and on your machine"
 authors = [
     "Milot Mirdita <milot.mirdita@mpibpc.mpg.de>",
diff --git a/test-data/complex/3G5O_A_3G5O_B/model_feat.pkl.xz b/test-data/complex/3G5O_A_3G5O_B/model_feat.pkl.xz
diff --git a/test-data/complex_monomer/A_A/model_feat.pkl.xz b/test-data/complex_monomer/A_A/model_feat.pkl.xz