v1.5.0

martinpacesa · Dec 30, 2024 · de45d16 · de45d16
1 parent 50c1532
commit de45d16
Show file tree

Hide file tree

Showing 27 changed files with 651 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,9 @@
 
 Simple binder design pipeline using AlphaFold2 backpropagation, MPNN, and PyRosetta. Select your target and let the script do the rest of the work and finish once you have enough designs to order!
 
-[Preprint link for BindCraft](https://www.biorxiv.org/content/10.1101/2024.09.30.615802v1)
+[Take the user experience poll!](https://forms.gle/XsGHDCyHtczVbamPA)
+
+[Preprint link for BindCraft](https://www.biorxiv.org/content/10.1101/2024.09.30.615802)
 
 ## Installation
 First you need to clone this repository. Replace **[install_folder]** with the path where you want to install it.
@@ -72,6 +74,8 @@ rm_template_seq_design          -> remove target template sequence for design (i
 rm_template_seq_predict         -> remove target template sequence for reprediction (increases target flexibility)
 rm_template_sc_design           -> remove sidechains from target template for design
 rm_template_sc_predict          -> remove sidechains from target template for reprediction
+predict_initial_guess           -> Introduce bias by providing binder atom positions as a starting point for prediction. Recommended if designs fail after MPNN optimization.
+predict_bigbang                 -> Introduce atom position bias into the structure module for atom initialisation. Recommended if target and design are large (more than 600 amino acids).
 
 # Design iterations
 soft_iterations                 -> number of soft iterations (all amino acids considered at all positions)
@@ -100,6 +104,7 @@ use_rg_loss                     -> use radius of gyration loss?
 weights_rg                      -> Design weight - radius of gyration weight for binder
 use_termini_distance_loss       -> Try to minimise distance between N- and C-terminus of binder? Helpful for grafting
 weights_termini_loss            -> Design weight - N- and C-terminus distance minimisation weight of binder
+cyclize_peptide                 -> Make the binder/peptide design cyclic
 
 # MPNN settings
 mpnn_fix_interface              -> whether to fix the interface designed in the starting trajectory

diff --git a/bindcraft.py b/bindcraft.py
@@ -58,6 +58,9 @@
 ####################################
 ### initialise PyRosetta
 pr.init(f'-ignore_unrecognized_res -ignore_zero_occupancy -mute all -holes:dalphaball {advanced_settings["dalphaball_path"]} -corrections::beta_nov16 true -relax:default_repeats 1')
+print(f"Running binder design for target {settings_file}")
+print(f"Design settings used: {advanced_file}")
+print(f"Filtering designs based on {filters_file}")
 
 ####################################
 # initialise counters
@@ -119,7 +122,7 @@
         print("")
 
         # Proceed if there is no trajectory termination signal
-        if trajectory.aux["log"]['terminate'] == "":
+        if trajectory.aux["log"]["terminate"] == "":
             # Relax binder to calculate statistics
             trajectory_relaxed = os.path.join(design_paths["Trajectory/Relaxed"], design_name + ".pdb")
             pr_relax(trajectory_pdb, trajectory_relaxed)
@@ -194,9 +197,13 @@
                     clear_mem()
                     # compile complex prediction model
                     complex_prediction_model = mk_afdesign_model(protocol="binder", num_recycles=advanced_settings["num_recycles_validation"], data_dir=advanced_settings["af_params_dir"], 
-                                                                use_multimer=multimer_validation)
-                    complex_prediction_model.prep_inputs(pdb_filename=target_settings["starting_pdb"], chain=target_settings["chains"], binder_len=length, rm_target_seq=advanced_settings["rm_template_seq_predict"],
-                                                        rm_target_sc=advanced_settings["rm_template_sc_predict"])
+                                                                use_multimer=multimer_validation, use_initial_guess=advanced_settings["predict_initial_guess"], use_initial_atom_pos=advanced_settings["predict_bigbang"])
+                    if advanced_settings["predict_initial_guess"] or advanced_settings["predict_bigbang"]:
+                        complex_prediction_model.prep_inputs(pdb_filename=trajectory_pdb, chain='A', binder_chain='B', binder_len=length, use_binder_template=True, rm_target_seq=advanced_settings["rm_template_seq_predict"],
+                                                            rm_target_sc=advanced_settings["rm_template_sc_predict"], rm_template_ic=True)
+                    else:
+                        complex_prediction_model.prep_inputs(pdb_filename=target_settings["starting_pdb"], chain=target_settings["chains"], binder_len=length, rm_target_seq=advanced_settings["rm_template_seq_predict"],
+                                                            rm_target_sc=advanced_settings["rm_template_sc_predict"])
 
                     # compile binder monomer prediction model
                     binder_prediction_model = mk_afdesign_model(protocol="hallucination", use_templates=False, initial_guess=False, 
@@ -221,7 +228,7 @@
                             save_fasta(mpnn_design_name, mpnn_sequence['seq'], design_paths)
 
                         ### Predict mpnn redesigned binder complex using masked templates
-                        mpnn_complex_statistics, pass_af2_filters = masked_binder_predict(complex_prediction_model,
+                        mpnn_complex_statistics, pass_af2_filters = predict_binder_complex(complex_prediction_model,
                                                                                         mpnn_sequence['seq'], mpnn_design_name,
                                                                                         target_settings["starting_pdb"], target_settings["chains"],
                                                                                         length, trajectory_pdb, prediction_models, advanced_settings,
@@ -452,4 +459,4 @@
 ### Script finished
 elapsed_time = time.time() - script_start_time
 elapsed_text = f"{'%d hours, %d minutes, %d seconds' % (int(elapsed_time // 3600), int((elapsed_time % 3600) // 60), int(elapsed_time % 60))}"
-print("Finished all designs. Script execution for "+str(trajectory_n)+" trajectories took: "+elapsed_text)
+print("Finished all designs. Script execution for "+str(trajectory_n)+" trajectories took: "+elapsed_text)
diff --git a/bindcraft.slurm b/bindcraft.slurm
@@ -37,8 +37,11 @@ while true ; do
     esac
 done
 
+# Ensure that SETTINGS is not empty
+if [ -z "$SETTINGS" ]; then
+    echo "Error: The -s or --settings option is required."
+    exit 1
+fi
+
 echo "Running the BindCraft pipeline"
-echo "Running binder design for target ${SETTINGS}"
-echo "Design settings used: ${ADVANCED}"
-echo "Filtering designs based on ${FILTERS}"
 python -u "${SCRIPT_DIR}/bindcraft.py" --settings "${SETTINGS}" --filters "${FILTERS}" --advanced "${ADVANCED}"
diff --git a/functions/__init__.py b/functions/__init__.py
@@ -17,4 +17,4 @@
 #os.environ["SLURM_STEP_NODELIST"] = os.environ["SLURM_NODELIST"]
 warnings.simplefilter(action='ignore', category=FutureWarning)
 warnings.simplefilter(action='ignore', category=DeprecationWarning)
-warnings.simplefilter(action='ignore', category=BiopythonWarning)
+warnings.simplefilter(action='ignore', category=BiopythonWarning)
diff --git a/functions/colabdesign_utils.py b/functions/colabdesign_utils.py
@@ -236,7 +236,7 @@ def binder_hallucination(design_name, starting_pdb, chain, target_hotspot_residu
     return af_model
 
 # run prediction for binder with masked template target
-def masked_binder_predict(prediction_model, binder_sequence, mpnn_design_name, target_pdb, chain, length, trajectory_pdb, prediction_models, advanced_settings, filters, design_paths, failure_csv, seed=None):
+def predict_binder_complex(prediction_model, binder_sequence, mpnn_design_name, target_pdb, chain, length, trajectory_pdb, prediction_models, advanced_settings, filters, design_paths, failure_csv, seed=None):
     prediction_stats = {}
 
     # clean sequence
@@ -246,6 +246,10 @@ def masked_binder_predict(prediction_model, binder_sequence, mpnn_design_name, t
     pass_af2_filters = True
     filter_failures = {}
 
+    if advanced_settings["cyclize_peptide"]:
+        # make macrocycle peptide
+        add_cyclic_offset(prediction_model)
+
     # start prediction per AF2 model, 2 are used by default due to masked templates
     for model_num in prediction_models:
         # check to make sure prediction does not exist already
@@ -313,6 +317,10 @@ def predict_binder_alone(prediction_model, binder_sequence, mpnn_design_name, le
     binder_sequence = re.sub("[^A-Z]", "", binder_sequence.upper())
     prediction_model.set_seq(binder_sequence)
 
+    if advanced_settings["cyclize_peptide"]:
+        # make macrocycle peptide
+        add_cyclic_offset(prediction_model)
+
     # predict each model separately
     for model_num in prediction_models:
         # check to make sure prediction does not exist already

diff --git a/functions/generic_utils.py b/functions/generic_utils.py
@@ -303,7 +303,7 @@ def save_fasta(design_name, sequence, design_paths):
 def clean_pdb(pdb_file):
     # Read the pdb file and filter relevant lines
     with open(pdb_file, 'r') as f_in:
-        relevant_lines = [line for line in f_in if line.startswith(('ATOM', 'HETATM', 'MODEL', 'TER', 'END'))]
+        relevant_lines = [line for line in f_in if line.startswith(('ATOM', 'HETATM', 'MODEL', 'TER', 'END', 'LINK'))]
 
     # Write the cleaned lines back to the original pdb file
     with open(pdb_file, 'w') as f_out:

diff --git a/notebooks/BindCraft.ipynb b/notebooks/BindCraft.ipynb
@@ -165,9 +165,11 @@
     "# @markdown ---\n",
     "# @markdown Which binder design protocol to run? Default is recommended. \"Beta-sheet\" promotes the design of more beta sheeted proteins, but requires more sampling. \"Peptide\" is optimised for helical peptide binders.\n",
     "design_protocol = \"Default\" # @param [\"Default\",\"Beta-sheet\",\"Peptide\"]\n",
-    "# @markdown What interface design method to use?. \"AlphaFold2\" is the default, interface is generated by AlphaFold2. \"MPNN\" uses soluble MPNN to optimise the interface, but majority of residues still originate from AlphaFold2.\n",
+    "# @markdown What prediction protocol to use? \"Default\" performs single sequence prediction of the binder. \"HardTarget\" uses an initial guess to improve complex prediction for difficult targets, but might introduce some bias.\n",
+    "prediction_protocol = \"Default\" # @param [\"Default\",\"HardTarget\"]\n",
+    "# @markdown What interface design method to use? \"AlphaFold2\" is the default; the interface is generated by AlphaFold2. \"MPNN\" uses soluble MPNN to optimise the interface.\n",
     "interface_protocol = \"AlphaFold2\" # @param [\"AlphaFold2\",\"MPNN\"]\n",
-    "# @markdown What target template protocol to use? \"Default\" allows for limited amount flexibility. \"Masked\" allows for greater target flexibility on both sidechain and backbone level, but might result in reduced experimental success rates.\n",
+    "# @markdown What target template protocol to use? \"Default\" allows for a limited amount of flexibility. \"Masked\" allows for greater target flexibility on both sidechain and backbone level.\n",
     "template_protocol = \"Default\" # @param [\"Default\",\"Masked\"]\n",
     "# @markdown ---\n",
     "\n",
@@ -194,6 +196,16 @@
     "else:\n",
     "    raise ValueError(f\"Unsupported template protocol\")\n",
     "\n",
+    "if design_protocol in [\"Peptide\"]:\n",
+    "    prediction_protocol_tag = \"\"\n",
+    "else:\n",
+    "    if prediction_protocol == \"Default\":\n",
+    "        prediction_protocol_tag = \"\"\n",
+    "    elif prediction_protocol == \"HardTarget\":\n",
+    "        prediction_protocol_tag = \"_hardtarget\"\n",
+    "    else:\n",
+    "        raise ValueError(f\"Unsupported prediction protocol\")\n",
+    "\n",
     "advanced_settings_path = \"/content/bindcraft/settings_advanced/\" + design_protocol_tag + interface_protocol_tag + template_protocol_tag + \".json\"\n",
     "\n",
     "currenttime = datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
@@ -212,7 +224,7 @@
     "#@title Filters\n",
     "# @markdown ---\n",
     "# @markdown Which filters for designs to use? \"Default\" are recommended, \"Peptide\" are for the design of peptide binders, \"Relaxed\" are more permissive but may result in fewer experimental successes, \"Peptide_Relaxed\" are more permissive filters for non-helical peptides, \"None\" is for benchmarking.\n",
-    "filter_option = \"Peptide\" # @param [\"Default\", \"Peptide\", \"Relaxed\", \"Peptide_Relaxed\", \"None\"]\n",
+    "filter_option = \"Default\" # @param [\"Default\", \"Peptide\", \"Relaxed\", \"Peptide_Relaxed\", \"None\"]\n",
     "# @markdown ---\n",
     "\n",
     "if filter_option == \"Default\":\n",

diff --git a/settings_advanced/betasheet_4stage_multimer.json b/settings_advanced/betasheet_4stage_multimer.json
@@ -8,6 +8,8 @@
     "rm_template_seq_predict": false,
     "rm_template_sc_design": false,
     "rm_template_sc_predict": false,
+    "predict_initial_guess": false,
+    "predict_bigbang": false,
     "soft_iterations": 75,
     "temporary_iterations": 45,
     "hard_iterations": 5,
@@ -23,7 +25,7 @@
     "intra_contact_distance": 14.0,
     "inter_contact_distance": 20.0,
     "intra_contact_number": 2,
-    "inter_contact_number": 1,
+    "inter_contact_number": 2,
     "weights_helicity": -2.0,
     "random_helicity": false,
     "use_i_ptm_loss": true,
@@ -32,6 +34,7 @@
     "weights_rg": 0.3,
     "use_termini_distance_loss": false,
     "weights_termini_loss": 0.1,
+    "cyclize_peptide": false,
     "enable_mpnn": true,
     "mpnn_fix_interface": true,
     "num_seqs": 20,
@@ -57,7 +60,7 @@
     "max_trajectories": false,
     "enable_rejection_check": true,
     "acceptance_rate": 0.01,
-    "start_monitoring": 200,
+    "start_monitoring": 600,
     "af_params_dir": "",
     "dssp_path": "",
     "dalphaball_path": ""

diff --git a/settings_advanced/betasheet_4stage_multimer_flexible.json b/settings_advanced/betasheet_4stage_multimer_flexible.json
@@ -8,6 +8,8 @@
     "rm_template_seq_predict": true,
     "rm_template_sc_design": false,
     "rm_template_sc_predict": false,
+    "predict_initial_guess": false,
+    "predict_bigbang": false,
     "soft_iterations": 75,
     "temporary_iterations": 45,
     "hard_iterations": 5,
@@ -23,7 +25,7 @@
     "intra_contact_distance": 14.0,
     "inter_contact_distance": 20.0,
     "intra_contact_number": 2,
-    "inter_contact_number": 1,
+    "inter_contact_number": 2,
     "weights_helicity": -2.0,
     "random_helicity": false,
     "use_i_ptm_loss": true,
@@ -32,6 +34,7 @@
     "weights_rg": 0.3,
     "use_termini_distance_loss": false,
     "weights_termini_loss": 0.1,
+    "cyclize_peptide": false,
     "enable_mpnn": true,
     "mpnn_fix_interface": true,
     "num_seqs": 20,
@@ -57,7 +60,7 @@
     "max_trajectories": false,
     "enable_rejection_check": true,
     "acceptance_rate": 0.01,
-    "start_monitoring": 200,
+    "start_monitoring": 600,
     "af_params_dir": "",
     "dssp_path": "",
     "dalphaball_path": ""

diff --git a/settings_advanced/betasheet_4stage_multimer_flexible_hardtarget.json b/settings_advanced/betasheet_4stage_multimer_flexible_hardtarget.json
@@ -0,0 +1,67 @@
+{
+    "omit_AAs": "C",
+    "force_reject_AA": false,
+    "use_multimer_design": true,
+    "design_algorithm": "4stage",
+    "sample_models": true,
+    "rm_template_seq_design": true,
+    "rm_template_seq_predict": true,
+    "rm_template_sc_design": false,
+    "rm_template_sc_predict": false,
+    "predict_initial_guess": true,
+    "predict_bigbang": false,
+    "soft_iterations": 75,
+    "temporary_iterations": 45,
+    "hard_iterations": 5,
+    "greedy_iterations": 15,
+    "greedy_percentage": 5,
+    "save_design_animations": true,
+    "save_design_trajectory_plots": true,
+    "weights_plddt": 0.15,
+    "weights_pae_intra": 0.4,
+    "weights_pae_inter": 0.1,
+    "weights_con_intra": 0.4,
+    "weights_con_inter": 0.5,
+    "intra_contact_distance": 14.0,
+    "inter_contact_distance": 20.0,
+    "intra_contact_number": 2,
+    "inter_contact_number": 2,
+    "weights_helicity": -2.0,
+    "random_helicity": false,
+    "use_i_ptm_loss": true,
+    "weights_iptm": 0.05,
+    "use_rg_loss": true,
+    "weights_rg": 0.3,
+    "use_termini_distance_loss": false,
+    "weights_termini_loss": 0.1,
+    "cyclize_peptide": false,
+    "enable_mpnn": true,
+    "mpnn_fix_interface": true,
+    "num_seqs": 20,
+    "max_mpnn_sequences": 2,
+    "sampling_temp": 0.1,
+    "backbone_noise": 0.00,
+    "model_path": "v_48_020",
+    "mpnn_weights": "soluble",
+    "save_mpnn_fasta": false,
+    "num_recycles_design": 1,
+    "num_recycles_validation": 3,
+    "optimise_beta": true,
+    "optimise_beta_extra_soft": 0,
+    "optimise_beta_extra_temp": 0,
+    "optimise_beta_recycles_design": 3,
+    "optimise_beta_recycles_valid": 3,
+    "remove_unrelaxed_trajectory": true,
+    "remove_unrelaxed_complex": true,
+    "remove_binder_monomer": true,
+    "zip_animations": true,
+    "zip_plots": true,
+    "save_trajectory_pickle": false,
+    "max_trajectories": false,
+    "enable_rejection_check": true,
+    "acceptance_rate": 0.01,
+    "start_monitoring": 600,
+    "af_params_dir": "",
+    "dssp_path": "",
+    "dalphaball_path": ""
+}
diff --git a/settings_advanced/betasheet_4stage_multimer_hardtarget.json b/settings_advanced/betasheet_4stage_multimer_hardtarget.json
@@ -0,0 +1,67 @@
+{
+    "omit_AAs": "C",
+    "force_reject_AA": false,
+    "use_multimer_design": true,
+    "design_algorithm": "4stage",
+    "sample_models": true,
+    "rm_template_seq_design": false,
+    "rm_template_seq_predict": false,
+    "rm_template_sc_design": false,
+    "rm_template_sc_predict": false,
+    "predict_initial_guess": true,
+    "predict_bigbang": false,
+    "soft_iterations": 75,
+    "temporary_iterations": 45,
+    "hard_iterations": 5,
+    "greedy_iterations": 15,
+    "greedy_percentage": 5,
+    "save_design_animations": true,
+    "save_design_trajectory_plots": true,
+    "weights_plddt": 0.15,
+    "weights_pae_intra": 0.4,
+    "weights_pae_inter": 0.1,
+    "weights_con_intra": 0.4,
+    "weights_con_inter": 0.5,
+    "intra_contact_distance": 14.0,
+    "inter_contact_distance": 20.0,
+    "intra_contact_number": 2,
+    "inter_contact_number": 2,
+    "weights_helicity": -2.0,
+    "random_helicity": false,
+    "use_i_ptm_loss": true,
+    "weights_iptm": 0.05,
+    "use_rg_loss": true,
+    "weights_rg": 0.3,
+    "use_termini_distance_loss": false,
+    "weights_termini_loss": 0.1,
+    "cyclize_peptide": false,
+    "enable_mpnn": true,
+    "mpnn_fix_interface": true,
+    "num_seqs": 20,
+    "max_mpnn_sequences": 2,
+    "sampling_temp": 0.1,
+    "backbone_noise": 0.00,
+    "model_path": "v_48_020",
+    "mpnn_weights": "soluble",
+    "save_mpnn_fasta": false,
+    "num_recycles_design": 1,
+    "num_recycles_validation": 3,
+    "optimise_beta": true,
+    "optimise_beta_extra_soft": 0,
+    "optimise_beta_extra_temp": 0,
+    "optimise_beta_recycles_design": 3,
+    "optimise_beta_recycles_valid": 3,
+    "remove_unrelaxed_trajectory": true,
+    "remove_unrelaxed_complex": true,
+    "remove_binder_monomer": true,
+    "zip_animations": true,
+    "zip_plots": true,
+    "save_trajectory_pickle": false,
+    "max_trajectories": false,
+    "enable_rejection_check": true,
+    "acceptance_rate": 0.01,
+    "start_monitoring": 600,
+    "af_params_dir": "",
+    "dssp_path": "",
+    "dalphaball_path": ""
+}