Module PDAnalysis.deformation
Expand source code
from pathlib import Path
import numpy as np
import pandas as pd
from .protein import Protein, AverageProtein
from .utils import rotate_points, get_shared_indices, get_mutation_position
class Deformation:
""" Deformation object description:
The Deformation object takes two protein objects (Protein, AverageProtein),
and calculates a range of deformation metrics.
Methods
-------
Effective Strain : strain
mean relative change in distance between neighbors in two neighborhoods
[McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A.,
& Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv]
Shear Strain : shear
magnitude of the off-diagonal components of the strain tensor
[Eckmann, J P, J. Rougemont, and T. Tlusty (2019), “Colloquium: Proteins:
The physics of amorphous evolving matter,” Rev. Mod. Phys. 91, 031001]
Non-Affine Strain : non_affine
non-linear component of strain, defined by the residual of the fit of the strain tensor
to two neighborhood tensors
[Falk, M L, and J. S. Langer (1998), 'Dynamics of viscoplastic deformation in
amorphous solids', Phys. Rev. E 57, 7192–7205]
Local Distance Difference (LDD) : ldd
L2 norm of the difference between distances between neighbors in two neighborhoods
[McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A.,
& Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv]
Local Distance Difference Test (LDDT) : lddt
fraction of differences between distances between neighbors that are within some
set thresholds in two neighborhoods
[Mariani, Valerio, Marco Biasini, Alessandro Barbato, and Torsten Schwede (2013),
'lDDT: a local superposition-free score for comparing protein structures and models
using distance difference tests', Bioinformatics 29 (21), 2722–2728]
Neighborhood Distance : neighborhood_dist
L2 norm of the difference between two neighborhood tensors
[McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A.,
& Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv]
Root Mean Square Deviation (RMSD) : rmsd
L2 norm of the difference between two protein structures
Distance to Mutated Site : mut_dist
Distance (angstroms) of each residue to the nearest mutated site
Attributes
----------
all_methods : list
list of all methods:
['mut_dist', 'strain', 'shear', 'non_affine', 'ldd',
'lddt', 'neighborhood_dist', 'rmsd']
sub_pos : list(int)
list of sequence indices where sequences of protein_1 and protein_2 are different;
indices start from zero
sub_str : list(str)
sequence substitutions written in the format {amino_acid_1}{index}{amino_acid_2};
indices start from one (as is convention)
"""
def __init__(self, protein_1, protein_2, **kwargs):
"""
args
----------
protein_1: (Protein, AverageProtein)
protein_2: (Protein, AverageProtein)
kwargs
----------
method : (str, list, set, tuple, np.ndarray) : default = ["strain"]
method(s) to be used when calling self.run();
'all' results in all methods being used;
multiple methods can be passed as an iterable object
lddt_cutoffs : list : default = [0.5, 1, 2, 4]
specify a list of thresholds used to calculate LDDT
neigh_cut : float : default = 13.0
cutoff radius used to determine neighborhood;
can be used to update Protein and AverageProtein objects
force_cutoff : bool : default = False
recalculate Protein and AverageProtein neighborhoods if
Deformation.neigh_cut != Protein.neigh_cut
force_norm : bool : default = False
force a metric that is not normally normalized by the number
of neighbors to be normalized in this way
force_nonorm : bool : default = False
force a metric that is normally normalized by the number
of neighbors to not be normalized in this way
force_relative : bool : default = False
force a metric that is not normally normalized by neighbor
distance to be normalized in this way
force_absolute : bool : default = False
force a metric that is normally normalized by neighbor
distance to not be normalized in this way
force_nonorm : bool : default = False
force a metric that is normally normalized by the number
of neighbors to not be normalized in this way
verbose : bool : default = True
print a summary of the deformation calculation
"""
self.prot1 = protein_1
self.prot2 = protein_2
self.proteins = [self.prot1, self.prot2]
self.default_method = ["strain"]
self.all_methods = ["mut_dist", "strain", "shear", "non_affine", "ldd", "lddt", "neighborhood_dist", "rmsd"]
self.lddt_cutoffs = kwargs.get("lddt_cutoffs", [0.5, 1, 2, 4])
self.method = kwargs.get('method', self.default_method.copy())
self.neigh_cut = kwargs.get('neigh_cut', 13.0)
self.force_cutoff = kwargs.get('force_cutoff', False)
self.force_norm = kwargs.get('force_norm', False)
self.force_nonorm = kwargs.get('force_nonorm', False)
self.force_relative = kwargs.get('force_relative', False)
self.force_absolute = kwargs.get('force_absolute', False)
self.verbose = kwargs.get('verbose', True)
self.sub_pos = None
self.sub_str = ''
self._parse_input()
self._parse_method()
if self.verbose:
self._print_inputs_summary()
def _print_inputs_summary(self):
"""Print summary of the deformation calculation"""
print(f"Comparing {self.prot1} with {self.prot2}.")
print(f"Sequence length :: {self.prot1.seq_len}")
nmiss1 = sum([len(i) == 0 for i in self.prot1.neigh_idx])
nmiss2 = sum([len(i) == 0 for i in self.prot2.neigh_idx])
print(f"Number of residues excluded due to missing coordinates, or due to low pLDDT" + \
f" / high B-factor ::\n\tProtA, {nmiss1}\n\tProtB, {nmiss2}")
print(f"Amino acid substitutions :: {' '.join(self.sub_str)}")
print(f"Methods to run :: {' '.join(self.method)}")
def _parse_input(self):
"""
Accepts protein (Protein, AverageProtein) objects as input.
Checks protein objects for consistency in neighborhood cutoff radii,
protein size, and checks sequences for differences (mutations).
"""
# Check input types
for prot in self.proteins:
if not isinstance(prot, (AverageProtein, Protein)):
raise Exception(f"Input object type {type(prot)} is not supported.")
# Check that neighbor cutoff definitions are consistent.
# This method also loads neighborhoods if they are not loaded already.
self._check_neighborhoods()
# Check that protein coordinate arrays are the same length
l1 = len(self.prot1.neigh_idx)
l2 = len(self.prot2.neigh_idx)
if l1 != l2:
raise Exception("Protein coordinate arrays are not the same length: " + \
f"Protein A has {l1} residues, while " + \
f"Protein B has {l2} residues.\n" + \
"If using PDB files with missing coordinates, use the --fix_pdb option.")
try:
self.sub_pos = get_mutation_position(self.prot1.sequence, self.prot2.sequence)
self.sub_str = self._get_substitution_strings()
except AttributeError as E:
raise AttributeError("Sequence is not defined for Protein object")
def _get_substitution_strings(self):
"""Get conventional representation of mutation as a string"""
return [f"{self.prot1.sequence[i]}{i+1}{self.prot2.sequence[i]}" for i in self.sub_pos]
def _update_protein_neighborhood(self, prot, neigh_cut):
"""Check Protein neighbood and recalculate if neigh_cut is wrong"""
# If not calculated yet... calculate
if len(prot.neigh_idx) == 0:
prot.neigh_cut = neigh_cut
prot.get_local_neighborhood()
# If neigh cut is wrong... calculate
elif prot.neigh_cut != neigh_cut:
print(f"Recalculating Protein with new neighbor cutoff = {neigh_cut}")
prot.neigh_cut = neigh_cut
prot.get_local_neighborhood()
def _update_averageProtein_neighborhood(self, prot, neigh_cut):
"""Check AverageProtein neighbood and recalculate if neigh_cut is wrong"""
# If not calculated yet... calculate
if len(prot.neigh_idx) == 0:
prot.neigh_cut = neigh_cut
prot.get_average_structure()
# If neigh cut is wrong... calculate
elif prot.neigh_cut != neigh_cut:
print(f"Recalculating AverageProtein with new neighbor cutoff = {neigh_cut}")
prot.neigh_cut = neigh_cut
prot.recalculate_average_structure()
# Check for consistency in the use of neighbor cutoffs
def _check_neighborhoods(self):
"""
Run some consistency checks on the two (Protein, AverageProtein) objects.
Recalculate neighborhoods if parameters (neigh_cut, min_plddt, max_bfactor) have changed.
"""
neigh_cut = [prot.neigh_cut for prot in self.proteins]
# Check for consistency between Protein objects.
# If they are inconsistent, either force them to use Deformation.neigh_cutoff,
# or exit
if neigh_cut[0] != neigh_cut[1]:
if self.force_cutoff:
for prot in self.proteins:
if isinstance(prot, Protein):
self._update_protein_neighborhood(prot, self.neigh_cut)
if isinstance(prot, AverageProtein):
self._update_averageProtein_neighborhood(prot, self.neigh_cut)
else:
raise Exception("AverageProtein / Protein objects were created with different neighbor cutoffs!" + \
"\n\tYou need to use the same neighbor cutoff for each structure," + \
"\n\tor to automatically recalculate neighborhoods using Deformation.neigh_cutoff," +\
" use Deformation(..., force_cutoff=True)")
return
# If not "force_cutoff", then ensure Deformation.neigh_cutoff equals that of the Protein objects
if not self.force_cutoff:
if self.neigh_cut != neigh_cut[0]:
self.neigh_cut = neigh_cut[0]
print(f"WARNING! Resetting neighbour cutoff to {self.neigh_cut}, since this " + \
"value was used for the AverageProtein structure." + \
"\n\tTo override this, use Deformation(..., force_cutoff=True)")
for i, prot in enumerate(self.proteins):
if isinstance(prot, Protein):
self._update_protein_neighborhood(prot, self.neigh_cut)
if isinstance(prot, AverageProtein):
self._update_averageProtein_neighborhood(prot, self.neigh_cut)
if self.force_cutoff:
for i, prot in enumerate(self.proteins):
prot.neigh_cut = self.neigh_cut
prot.recalculate_average_structure()
else:
self.neigh_cut = neigh_cut[0]
# Parse method, and ensure methods are acceptable
def _parse_method(self):
"""Parse method into the appropriate format"""
if isinstance(self.method, str):
self.method = [self.method]
if isinstance(self.method, (list, np.ndarray, set, tuple)):
if len(self.method) == 1:
if self.method[0] == 'all':
self.method = self.all_methods.copy()
else:
method_list = []
for method in self.method:
if method in self.all_methods:
method_list.append(method)
else:
print(f"WARNING! {method} is not an accepted method")
self.method = method_list
else:
self.method = []
if len(self.method) == 0:
raise Exception("No acceptable method found!" + \
f"\nChoose one out of: all, {', '.join(self.all_methods)}")
# Set method, and run through parse to check method validity
def set_method(self, value):
"""Set the method(s) to be used by self.run()
> (str, list, set, tuple, np.ndarray)
> 'all' results in all methods being used;
> multiple methods can be passed as an iterable object
> list of all methods:
> 'mut_dist'
> 'strain'
> 'shear'
> 'non_affine'
> 'ldd'
> 'lddt'
> 'neighborhood_dist'
> 'rmsd'
"""
self.method = value
self._parse_method()
def save_output(self, path_out):
"""Save outputs to CSV file"""
# Load any deformation that was calculated
deform = {}
for m in self.all_methods:
if hasattr(self, m):
if m != 'rmsd':
deform[m] = getattr(self, m)
else:
deform[m] = self.rmsd_per_residue
# Only save output if deformation was calculated
if not len(deform):
raise Exception("ERROR! Cannot save output if there is none!")
# Add sequence indices, residue names
else:
type_names = ['residue_index', 'protA_resname', 'protB_resname']
res_data = [np.arange(len(self.prot1.sequence)) + 1, self.prot1.sequence, self.prot2.sequence]
output = {k: d for k, d in zip(type_names, res_data) if not isinstance(d, type(None))}
########################################
### NEED TO ADD NUMBER OF NEIGHBORS
########################################
output.update(deform)
pd.DataFrame(output).to_csv(path_out, index=False)
def run(self):
"""
Runs through all of the methods in self.method.
The output of each method, {m}, is stored in self.{m} (e.g. self.strain);
RMSD is stored as self.rmsd (whole protein), and as RMSD per residue, self.rmsd_per_residue
"""
# Calculate deformation based on the specified method
for method in self.method:
self._run_analysis(method)
def _run_analysis(self, method):
"""Run the analysis code for a particular method"""
analyses = {'mut_dist': self.calculate_dist_from_mutation,
'strain': self.calculate_strain,
'shear': self.calculate_shear,
'non_affine': self.calculate_non_affine,
'ldd': self.calculate_ldd,
'lddt': self.calculate_lddt,
'neighborhood_dist': self.calculate_neighborhood_dist,
'rmsd': self.calculate_rmsd}
analyses[method]()
def _get_shared_indices(self):
"""Get shared indices between two neighborhoods"""
self.shared_indices = []
for i in range(self.prot1.seq_len):
# If no data for residue, shared indices is empty
if (not len(self.prot1.neigh_idx[i])) | (not len(self.prot2.neigh_idx[i])):
self.shared_indices.append(([], []))
else:
self.shared_indices.append(get_shared_indices(self.prot1.neigh_idx[i], self.prot2.neigh_idx[i]))
# Calculate distance from closest mutation
def calculate_dist_from_mutation(self):
"""Calcualte distance from the nearest mutated residue"""
# If none differ, then return np.nan
if not len(self.sub_pos):
print("WARNING! Trying to calculate distance from mutation, while comparing identical sequences")
return np.zeros(self.prot1.seq_len) * np.nan
# Calculate mindist using the full array (inc. nan)
mut_dist1 = self.prot1.dist_mat[:,self.sub_pos]
mut_dist2 = self.prot2.dist_mat[:,self.sub_pos]
# Average the distance across both structures,
# and get the minimum distance per residue to a mutated position
self.mut_dist = np.nanmin(0.5 * (mut_dist1 + mut_dist2), axis=1)
def _calculate_deformation(self, deformation_method):
"""Find shared residue indices between two neighborhoods
and calculate deformation per residue"""
deformation = np.zeros(self.prot1.seq_len, float) * np.nan
if not hasattr(self, "shared_indices"):
self._get_shared_indices()
kwargs = {arg: getattr(self, arg) for arg in ["force_relative", "force_norm", "force_absolute", "force_nonorm"]}
for i in range(self.prot1.seq_len):
# Get shared indices
i1, i2 = self.shared_indices[i]
# If no shared indices, leave np.nan
if not len(i1):
continue
deformation[i] = deformation_method(self.prot1.neigh_tensor[i][i1], self.prot2.neigh_tensor[i][i2], **kwargs)
return deformation
def _calculate_lddt_residue(self, neigh_tensor1, neigh_tensor2, **kwargs):
"""Calculate LDDT given a pair of neighborhood tensors"""
# Get local distance vectors
v1 = np.linalg.norm(neigh_tensor1, axis=1)
v2 = np.linalg.norm(neigh_tensor2, axis=1)
# Get local distance difference vector
dv = v2 - v1
return np.sum([np.sum(dv<=cut) for cut in self.lddt_cutoffs]) / (len(self.lddt_cutoffs) * len(dv))
def calculate_lddt(self):
"""Calculate LDDT"""
self.lddt = self._calculate_deformation(self._calculate_lddt_residue)
def _calculate_ldd_residue(self, neigh_tensor1, neigh_tensor2, **kwargs):
"""Calculate LDD given a pair of neighborhood tensors"""
# Get local distance vectors
v1 = np.linalg.norm(neigh_tensor1, axis=1)
v2 = np.linalg.norm(neigh_tensor2, axis=1)
# Get local distance difference vector
dv = v2 - v1
if force_relative:
# Normalize LDD by distance
dv = dv / v1
# Calculate local distance difference
if force_norm:
# Normalize LDD by number of neighbors
return np.linalg.norm(dv) / len(i1)
else:
return np.linalg.norm(dv)
def calculate_ldd(self):
"""Calculate LDD"""
self.ldd = self._calculate_deformation(self._calculate_ldd_residue)
def _calculate_nd_residue(self, neigh_tensor1, neigh_tensor2, **kwargs):
"""Calculate neighborhood distance given a pair of neighborhood tensors"""
# Rotate neighbourhood tensor and calculate Euclidean distance
nd = np.linalg.norm(rotate_points(neigh_tensor2, neigh_tensor1) - neigh_tensor1)
if self.force_norm:
return nd / len(i1)
else:
return nd
def calculate_neighborhood_dist(self):
"""Calculate neighborhood_distance"""
self.neighbor_distance = self._calculate_deformation(self._calculate_nd_residue)
def _calculate_shear_residue(self, neigh_tensor1, neigh_tensor2, **kwargs):
"""Calculate shear strain given a pair of neighborhood tensors"""
u1 = neigh_tensor1
u2 = neigh_tensor2
try:
duu = u2 @ u2.T - u1 @ u1.T
uu = np.linalg.inv(u1.T @ u1)
C = 0.5 * (uu @ u1.T @ duu @ u1 @ uu)
return 0.5 * np.sum(np.diag(C@C) - np.diag(C)**2)
except Exception as e:
print(e)
return np.nan
def calculate_shear(self):
"""Calculate shear strain"""
self.shear = self._calculate_deformation(self._calculate_shear_residue)
def _calculate_strain_residue(self, neigh_tensor1, neigh_tensor2, **kwargs):
"""Calculate effective strain given a pair of neighborhood tensors"""
# Rotate neighbourhood tensor and calculate Euclidean distance
nt3 = rotate_points(neigh_tensor2, neigh_tensor1)
if not self.force_absolute:
# Divide by length to get strain
es = np.sum(np.linalg.norm(nt3 - neigh_tensor1, axis=1) / np.linalg.norm(neigh_tensor1, axis=1))
else:
es = np.sum(np.linalg.norm(nt3 - neigh_tensor1, axis=1))
if not self.force_nonorm:
# Normalize ES by number of neighbors
es /= len(neigh_tensor1)
return es
def calculate_strain(self):
"""Calculate effective strain"""
self.strain = self._calculate_deformation(self._calculate_strain_residue)
def _calculate_non_affine_residue(self, neigh_tensor1, neigh_tensor2, **kwargs):
"""Calculate non-affine strain given a pair of neighborhood tensors"""
# Find the deformation gradient tensor, F
F, residuals = np.linalg.lstsq(neigh_tensor1, neigh_tensor2)[:2]
if not self.force_nonorm:
return np.nansum(residuals)
else:
return np.nansum(residuals) / len(neigh_tensor1)
def calculate_non_affine(self):
"""Calculate non-affine strain"""
self.non_affine = self._calculate_deformation(self._calculate_non_affine_residue)
#######################################################
### Root-Mean-Square Deviation
### Superimpose structures and calculate RMSD
def calculate_rmsd(self):
"""
Calculate RMSD
Cannot calculate RMSD with averaged neighborhoods,
so if an AverageProtein object is passsed, we simply
take the first Protein object and use it to calculate RMSD.
"""
coords = []
for prot in self.proteins:
if isinstance(prot, Protein):
coords.append(prot.coord)
elif isinstance(prot, AverageProtein):
coords.append(prot.proteins[0].coord)
print("WARNING! Trying to calculate RMSD with an AverageProtein object. " + \
"Since this is not possible, an embedded Protein object is used instead.")
c1, c2 = coords
# Remove NAN values
idx = ~np.any(np.isnan(c1) | np.isnan(c2), axis=1)
c1, c2 = c1[idx], c2[idx]
c1 = c1 - np.mean(c1, axis=0).reshape(1, 3)
c2 = c2 - np.mean(c2, axis=0).reshape(1, 3)
c3 = rotate_points(c2, c1)
sd = np.sum((c1 - c3)**2, axis=1)
self.rmsd_per_residue = np.sqrt(sd)
self.rmsd = np.sqrt(np.mean(sd))
Classes
class Deformation (protein_1, protein_2, **kwargs)-
Deformation object description:
The Deformation object takes two protein objects (Protein, AverageProtein), and calculates a range of deformation metrics.
Methods
Effective Strain : strain mean relative change in distance between neighbors in two neighborhoods [McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A., & Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv]
Shear Strain : shear magnitude of the off-diagonal components of the strain tensor [Eckmann, J P, J. Rougemont, and T. Tlusty (2019), “Colloquium: Proteins: The physics of amorphous evolving matter,” Rev. Mod. Phys. 91, 031001]
Non-Affine Strain : non_affine non-linear component of strain, defined by the residual of the fit of the strain tensor to two neighborhood tensors [Falk, M L, and J. S. Langer (1998), 'Dynamics of viscoplastic deformation in amorphous solids', Phys. Rev. E 57, 7192–7205]
Local Distance Difference (LDD) : ldd L2 norm of the difference between distances between neighbors in two neighborhoods [McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A., & Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv]
Local Distance Difference Test (LDDT) : lddt fraction of differences between distances between neighbors that are within some set thresholds in two neighborhoods [Mariani, Valerio, Marco Biasini, Alessandro Barbato, and Torsten Schwede (2013), 'lDDT: a local superposition-free score for comparing protein structures and models using distance difference tests', Bioinformatics 29 (21), 2722–2728]
Neighborhood Distance : neighborhood_dist L2 norm of the difference between two neighborhood tensors [McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A., & Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv]
Root Mean Square Deviation (RMSD) : rmsd L2 norm of the difference between two protein structures
Distance to Mutated Site : mut_dist Distance (angstroms) of each residue to the nearest mutated site
Attributes
all_methods:list- list of all methods: ['mut_dist', 'strain', 'shear', 'non_affine', 'ldd', 'lddt', 'neighborhood_dist', 'rmsd']
sub_pos:list(int)- list of sequence indices where sequences of protein_1 and protein_2 are different; indices start from zero
sub_str:list(str)- sequence substitutions written in the format {amino_acid_1}{index}{amino_acid_2}; indices start from one (as is convention)
Args
protein_1:(Protein, AverageProtein)protein_2:(Protein, AverageProtein)
Kwargs
method : (str, list, set, tuple, np.ndarray) : default = ["strain"] method(s) to be used when calling self.run(); 'all' results in all methods being used; multiple methods can be passed as an iterable object
lddt_cutoffs : list : default = [0.5, 1, 2, 4] specify a list of thresholds used to calculate LDDT
neigh_cut : float : default = 13.0 cutoff radius used to determine neighborhood; can be used to update Protein and AverageProtein objects
force_cutoff : bool : default = False recalculate Protein and AverageProtein neighborhoods if Deformation.neigh_cut != Protein.neigh_cut
force_norm : bool : default = False force a metric that is not normally normalized by the number of neighbors to be normalized in this way
force_nonorm : bool : default = False force a metric that is normally normalized by the number of neighbors to not be normalized in this way
force_relative : bool : default = False force a metric that is not normally normalized by neighbor distance to be normalized in this way
force_absolute : bool : default = False force a metric that is normally normalized by neighbor distance to not be normalized in this way
force_nonorm : bool : default = False force a metric that is normally normalized by the number of neighbors to not be normalized in this way
verbose : bool : default = True print a summary of the deformation calculation
Expand source code
class Deformation: """ Deformation object description: The Deformation object takes two protein objects (Protein, AverageProtein), and calculates a range of deformation metrics. Methods ------- Effective Strain : strain mean relative change in distance between neighbors in two neighborhoods [McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A., & Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv] Shear Strain : shear magnitude of the off-diagonal components of the strain tensor [Eckmann, J P, J. Rougemont, and T. Tlusty (2019), “Colloquium: Proteins: The physics of amorphous evolving matter,” Rev. Mod. Phys. 91, 031001] Non-Affine Strain : non_affine non-linear component of strain, defined by the residual of the fit of the strain tensor to two neighborhood tensors [Falk, M L, and J. S. Langer (1998), 'Dynamics of viscoplastic deformation in amorphous solids', Phys. Rev. E 57, 7192–7205] Local Distance Difference (LDD) : ldd L2 norm of the difference between distances between neighbors in two neighborhoods [McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A., & Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv] Local Distance Difference Test (LDDT) : lddt fraction of differences between distances between neighbors that are within some set thresholds in two neighborhoods [Mariani, Valerio, Marco Biasini, Alessandro Barbato, and Torsten Schwede (2013), 'lDDT: a local superposition-free score for comparing protein structures and models using distance difference tests', Bioinformatics 29 (21), 2722–2728] Neighborhood Distance : neighborhood_dist L2 norm of the difference between two neighborhood tensors [McBride, J. M., Polev, K., Abdirasulov, A., Reinharz, V., Grzybowski, B. A., & Tlusty, T. (2023), 'AlphaFold2 can predict single-mutation effects', biorXiv] Root Mean Square Deviation (RMSD) : rmsd L2 norm of the difference between two protein structures Distance to Mutated Site : mut_dist Distance (angstroms) of each residue to the nearest mutated site Attributes ---------- all_methods : list list of all methods: ['mut_dist', 'strain', 'shear', 'non_affine', 'ldd', 'lddt', 'neighborhood_dist', 'rmsd'] sub_pos : list(int) list of sequence indices where sequences of protein_1 and protein_2 are different; indices start from zero sub_str : list(str) sequence substitutions written in the format {amino_acid_1}{index}{amino_acid_2}; indices start from one (as is convention) """ def __init__(self, protein_1, protein_2, **kwargs): """ args ---------- protein_1: (Protein, AverageProtein) protein_2: (Protein, AverageProtein) kwargs ---------- method : (str, list, set, tuple, np.ndarray) : default = ["strain"] method(s) to be used when calling self.run(); 'all' results in all methods being used; multiple methods can be passed as an iterable object lddt_cutoffs : list : default = [0.5, 1, 2, 4] specify a list of thresholds used to calculate LDDT neigh_cut : float : default = 13.0 cutoff radius used to determine neighborhood; can be used to update Protein and AverageProtein objects force_cutoff : bool : default = False recalculate Protein and AverageProtein neighborhoods if Deformation.neigh_cut != Protein.neigh_cut force_norm : bool : default = False force a metric that is not normally normalized by the number of neighbors to be normalized in this way force_nonorm : bool : default = False force a metric that is normally normalized by the number of neighbors to not be normalized in this way force_relative : bool : default = False force a metric that is not normally normalized by neighbor distance to be normalized in this way force_absolute : bool : default = False force a metric that is normally normalized by neighbor distance to not be normalized in this way force_nonorm : bool : default = False force a metric that is normally normalized by the number of neighbors to not be normalized in this way verbose : bool : default = True print a summary of the deformation calculation """ self.prot1 = protein_1 self.prot2 = protein_2 self.proteins = [self.prot1, self.prot2] self.default_method = ["strain"] self.all_methods = ["mut_dist", "strain", "shear", "non_affine", "ldd", "lddt", "neighborhood_dist", "rmsd"] self.lddt_cutoffs = kwargs.get("lddt_cutoffs", [0.5, 1, 2, 4]) self.method = kwargs.get('method', self.default_method.copy()) self.neigh_cut = kwargs.get('neigh_cut', 13.0) self.force_cutoff = kwargs.get('force_cutoff', False) self.force_norm = kwargs.get('force_norm', False) self.force_nonorm = kwargs.get('force_nonorm', False) self.force_relative = kwargs.get('force_relative', False) self.force_absolute = kwargs.get('force_absolute', False) self.verbose = kwargs.get('verbose', True) self.sub_pos = None self.sub_str = '' self._parse_input() self._parse_method() if self.verbose: self._print_inputs_summary() def _print_inputs_summary(self): """Print summary of the deformation calculation""" print(f"Comparing {self.prot1} with {self.prot2}.") print(f"Sequence length :: {self.prot1.seq_len}") nmiss1 = sum([len(i) == 0 for i in self.prot1.neigh_idx]) nmiss2 = sum([len(i) == 0 for i in self.prot2.neigh_idx]) print(f"Number of residues excluded due to missing coordinates, or due to low pLDDT" + \ f" / high B-factor ::\n\tProtA, {nmiss1}\n\tProtB, {nmiss2}") print(f"Amino acid substitutions :: {' '.join(self.sub_str)}") print(f"Methods to run :: {' '.join(self.method)}") def _parse_input(self): """ Accepts protein (Protein, AverageProtein) objects as input. Checks protein objects for consistency in neighborhood cutoff radii, protein size, and checks sequences for differences (mutations). """ # Check input types for prot in self.proteins: if not isinstance(prot, (AverageProtein, Protein)): raise Exception(f"Input object type {type(prot)} is not supported.") # Check that neighbor cutoff definitions are consistent. # This method also loads neighborhoods if they are not loaded already. self._check_neighborhoods() # Check that protein coordinate arrays are the same length l1 = len(self.prot1.neigh_idx) l2 = len(self.prot2.neigh_idx) if l1 != l2: raise Exception("Protein coordinate arrays are not the same length: " + \ f"Protein A has {l1} residues, while " + \ f"Protein B has {l2} residues.\n" + \ "If using PDB files with missing coordinates, use the --fix_pdb option.") try: self.sub_pos = get_mutation_position(self.prot1.sequence, self.prot2.sequence) self.sub_str = self._get_substitution_strings() except AttributeError as E: raise AttributeError("Sequence is not defined for Protein object") def _get_substitution_strings(self): """Get conventional representation of mutation as a string""" return [f"{self.prot1.sequence[i]}{i+1}{self.prot2.sequence[i]}" for i in self.sub_pos] def _update_protein_neighborhood(self, prot, neigh_cut): """Check Protein neighbood and recalculate if neigh_cut is wrong""" # If not calculated yet... calculate if len(prot.neigh_idx) == 0: prot.neigh_cut = neigh_cut prot.get_local_neighborhood() # If neigh cut is wrong... calculate elif prot.neigh_cut != neigh_cut: print(f"Recalculating Protein with new neighbor cutoff = {neigh_cut}") prot.neigh_cut = neigh_cut prot.get_local_neighborhood() def _update_averageProtein_neighborhood(self, prot, neigh_cut): """Check AverageProtein neighbood and recalculate if neigh_cut is wrong""" # If not calculated yet... calculate if len(prot.neigh_idx) == 0: prot.neigh_cut = neigh_cut prot.get_average_structure() # If neigh cut is wrong... calculate elif prot.neigh_cut != neigh_cut: print(f"Recalculating AverageProtein with new neighbor cutoff = {neigh_cut}") prot.neigh_cut = neigh_cut prot.recalculate_average_structure() # Check for consistency in the use of neighbor cutoffs def _check_neighborhoods(self): """ Run some consistency checks on the two (Protein, AverageProtein) objects. Recalculate neighborhoods if parameters (neigh_cut, min_plddt, max_bfactor) have changed. """ neigh_cut = [prot.neigh_cut for prot in self.proteins] # Check for consistency between Protein objects. # If they are inconsistent, either force them to use Deformation.neigh_cutoff, # or exit if neigh_cut[0] != neigh_cut[1]: if self.force_cutoff: for prot in self.proteins: if isinstance(prot, Protein): self._update_protein_neighborhood(prot, self.neigh_cut) if isinstance(prot, AverageProtein): self._update_averageProtein_neighborhood(prot, self.neigh_cut) else: raise Exception("AverageProtein / Protein objects were created with different neighbor cutoffs!" + \ "\n\tYou need to use the same neighbor cutoff for each structure," + \ "\n\tor to automatically recalculate neighborhoods using Deformation.neigh_cutoff," +\ " use Deformation(..., force_cutoff=True)") return # If not "force_cutoff", then ensure Deformation.neigh_cutoff equals that of the Protein objects if not self.force_cutoff: if self.neigh_cut != neigh_cut[0]: self.neigh_cut = neigh_cut[0] print(f"WARNING! Resetting neighbour cutoff to {self.neigh_cut}, since this " + \ "value was used for the AverageProtein structure." + \ "\n\tTo override this, use Deformation(..., force_cutoff=True)") for i, prot in enumerate(self.proteins): if isinstance(prot, Protein): self._update_protein_neighborhood(prot, self.neigh_cut) if isinstance(prot, AverageProtein): self._update_averageProtein_neighborhood(prot, self.neigh_cut) if self.force_cutoff: for i, prot in enumerate(self.proteins): prot.neigh_cut = self.neigh_cut prot.recalculate_average_structure() else: self.neigh_cut = neigh_cut[0] # Parse method, and ensure methods are acceptable def _parse_method(self): """Parse method into the appropriate format""" if isinstance(self.method, str): self.method = [self.method] if isinstance(self.method, (list, np.ndarray, set, tuple)): if len(self.method) == 1: if self.method[0] == 'all': self.method = self.all_methods.copy() else: method_list = [] for method in self.method: if method in self.all_methods: method_list.append(method) else: print(f"WARNING! {method} is not an accepted method") self.method = method_list else: self.method = [] if len(self.method) == 0: raise Exception("No acceptable method found!" + \ f"\nChoose one out of: all, {', '.join(self.all_methods)}") # Set method, and run through parse to check method validity def set_method(self, value): """Set the method(s) to be used by self.run() > (str, list, set, tuple, np.ndarray) > 'all' results in all methods being used; > multiple methods can be passed as an iterable object > list of all methods: > 'mut_dist' > 'strain' > 'shear' > 'non_affine' > 'ldd' > 'lddt' > 'neighborhood_dist' > 'rmsd' """ self.method = value self._parse_method() def save_output(self, path_out): """Save outputs to CSV file""" # Load any deformation that was calculated deform = {} for m in self.all_methods: if hasattr(self, m): if m != 'rmsd': deform[m] = getattr(self, m) else: deform[m] = self.rmsd_per_residue # Only save output if deformation was calculated if not len(deform): raise Exception("ERROR! Cannot save output if there is none!") # Add sequence indices, residue names else: type_names = ['residue_index', 'protA_resname', 'protB_resname'] res_data = [np.arange(len(self.prot1.sequence)) + 1, self.prot1.sequence, self.prot2.sequence] output = {k: d for k, d in zip(type_names, res_data) if not isinstance(d, type(None))} ######################################## ### NEED TO ADD NUMBER OF NEIGHBORS ######################################## output.update(deform) pd.DataFrame(output).to_csv(path_out, index=False) def run(self): """ Runs through all of the methods in self.method. The output of each method, {m}, is stored in self.{m} (e.g. self.strain); RMSD is stored as self.rmsd (whole protein), and as RMSD per residue, self.rmsd_per_residue """ # Calculate deformation based on the specified method for method in self.method: self._run_analysis(method) def _run_analysis(self, method): """Run the analysis code for a particular method""" analyses = {'mut_dist': self.calculate_dist_from_mutation, 'strain': self.calculate_strain, 'shear': self.calculate_shear, 'non_affine': self.calculate_non_affine, 'ldd': self.calculate_ldd, 'lddt': self.calculate_lddt, 'neighborhood_dist': self.calculate_neighborhood_dist, 'rmsd': self.calculate_rmsd} analyses[method]() def _get_shared_indices(self): """Get shared indices between two neighborhoods""" self.shared_indices = [] for i in range(self.prot1.seq_len): # If no data for residue, shared indices is empty if (not len(self.prot1.neigh_idx[i])) | (not len(self.prot2.neigh_idx[i])): self.shared_indices.append(([], [])) else: self.shared_indices.append(get_shared_indices(self.prot1.neigh_idx[i], self.prot2.neigh_idx[i])) # Calculate distance from closest mutation def calculate_dist_from_mutation(self): """Calcualte distance from the nearest mutated residue""" # If none differ, then return np.nan if not len(self.sub_pos): print("WARNING! Trying to calculate distance from mutation, while comparing identical sequences") return np.zeros(self.prot1.seq_len) * np.nan # Calculate mindist using the full array (inc. nan) mut_dist1 = self.prot1.dist_mat[:,self.sub_pos] mut_dist2 = self.prot2.dist_mat[:,self.sub_pos] # Average the distance across both structures, # and get the minimum distance per residue to a mutated position self.mut_dist = np.nanmin(0.5 * (mut_dist1 + mut_dist2), axis=1) def _calculate_deformation(self, deformation_method): """Find shared residue indices between two neighborhoods and calculate deformation per residue""" deformation = np.zeros(self.prot1.seq_len, float) * np.nan if not hasattr(self, "shared_indices"): self._get_shared_indices() kwargs = {arg: getattr(self, arg) for arg in ["force_relative", "force_norm", "force_absolute", "force_nonorm"]} for i in range(self.prot1.seq_len): # Get shared indices i1, i2 = self.shared_indices[i] # If no shared indices, leave np.nan if not len(i1): continue deformation[i] = deformation_method(self.prot1.neigh_tensor[i][i1], self.prot2.neigh_tensor[i][i2], **kwargs) return deformation def _calculate_lddt_residue(self, neigh_tensor1, neigh_tensor2, **kwargs): """Calculate LDDT given a pair of neighborhood tensors""" # Get local distance vectors v1 = np.linalg.norm(neigh_tensor1, axis=1) v2 = np.linalg.norm(neigh_tensor2, axis=1) # Get local distance difference vector dv = v2 - v1 return np.sum([np.sum(dv<=cut) for cut in self.lddt_cutoffs]) / (len(self.lddt_cutoffs) * len(dv)) def calculate_lddt(self): """Calculate LDDT""" self.lddt = self._calculate_deformation(self._calculate_lddt_residue) def _calculate_ldd_residue(self, neigh_tensor1, neigh_tensor2, **kwargs): """Calculate LDD given a pair of neighborhood tensors""" # Get local distance vectors v1 = np.linalg.norm(neigh_tensor1, axis=1) v2 = np.linalg.norm(neigh_tensor2, axis=1) # Get local distance difference vector dv = v2 - v1 if force_relative: # Normalize LDD by distance dv = dv / v1 # Calculate local distance difference if force_norm: # Normalize LDD by number of neighbors return np.linalg.norm(dv) / len(i1) else: return np.linalg.norm(dv) def calculate_ldd(self): """Calculate LDD""" self.ldd = self._calculate_deformation(self._calculate_ldd_residue) def _calculate_nd_residue(self, neigh_tensor1, neigh_tensor2, **kwargs): """Calculate neighborhood distance given a pair of neighborhood tensors""" # Rotate neighbourhood tensor and calculate Euclidean distance nd = np.linalg.norm(rotate_points(neigh_tensor2, neigh_tensor1) - neigh_tensor1) if self.force_norm: return nd / len(i1) else: return nd def calculate_neighborhood_dist(self): """Calculate neighborhood_distance""" self.neighbor_distance = self._calculate_deformation(self._calculate_nd_residue) def _calculate_shear_residue(self, neigh_tensor1, neigh_tensor2, **kwargs): """Calculate shear strain given a pair of neighborhood tensors""" u1 = neigh_tensor1 u2 = neigh_tensor2 try: duu = u2 @ u2.T - u1 @ u1.T uu = np.linalg.inv(u1.T @ u1) C = 0.5 * (uu @ u1.T @ duu @ u1 @ uu) return 0.5 * np.sum(np.diag(C@C) - np.diag(C)**2) except Exception as e: print(e) return np.nan def calculate_shear(self): """Calculate shear strain""" self.shear = self._calculate_deformation(self._calculate_shear_residue) def _calculate_strain_residue(self, neigh_tensor1, neigh_tensor2, **kwargs): """Calculate effective strain given a pair of neighborhood tensors""" # Rotate neighbourhood tensor and calculate Euclidean distance nt3 = rotate_points(neigh_tensor2, neigh_tensor1) if not self.force_absolute: # Divide by length to get strain es = np.sum(np.linalg.norm(nt3 - neigh_tensor1, axis=1) / np.linalg.norm(neigh_tensor1, axis=1)) else: es = np.sum(np.linalg.norm(nt3 - neigh_tensor1, axis=1)) if not self.force_nonorm: # Normalize ES by number of neighbors es /= len(neigh_tensor1) return es def calculate_strain(self): """Calculate effective strain""" self.strain = self._calculate_deformation(self._calculate_strain_residue) def _calculate_non_affine_residue(self, neigh_tensor1, neigh_tensor2, **kwargs): """Calculate non-affine strain given a pair of neighborhood tensors""" # Find the deformation gradient tensor, F F, residuals = np.linalg.lstsq(neigh_tensor1, neigh_tensor2)[:2] if not self.force_nonorm: return np.nansum(residuals) else: return np.nansum(residuals) / len(neigh_tensor1) def calculate_non_affine(self): """Calculate non-affine strain""" self.non_affine = self._calculate_deformation(self._calculate_non_affine_residue) ####################################################### ### Root-Mean-Square Deviation ### Superimpose structures and calculate RMSD def calculate_rmsd(self): """ Calculate RMSD Cannot calculate RMSD with averaged neighborhoods, so if an AverageProtein object is passsed, we simply take the first Protein object and use it to calculate RMSD. """ coords = [] for prot in self.proteins: if isinstance(prot, Protein): coords.append(prot.coord) elif isinstance(prot, AverageProtein): coords.append(prot.proteins[0].coord) print("WARNING! Trying to calculate RMSD with an AverageProtein object. " + \ "Since this is not possible, an embedded Protein object is used instead.") c1, c2 = coords # Remove NAN values idx = ~np.any(np.isnan(c1) | np.isnan(c2), axis=1) c1, c2 = c1[idx], c2[idx] c1 = c1 - np.mean(c1, axis=0).reshape(1, 3) c2 = c2 - np.mean(c2, axis=0).reshape(1, 3) c3 = rotate_points(c2, c1) sd = np.sum((c1 - c3)**2, axis=1) self.rmsd_per_residue = np.sqrt(sd) self.rmsd = np.sqrt(np.mean(sd))Methods
def calculate_dist_from_mutation(self)-
Calcualte distance from the nearest mutated residue
Expand source code
def calculate_dist_from_mutation(self): """Calcualte distance from the nearest mutated residue""" # If none differ, then return np.nan if not len(self.sub_pos): print("WARNING! Trying to calculate distance from mutation, while comparing identical sequences") return np.zeros(self.prot1.seq_len) * np.nan # Calculate mindist using the full array (inc. nan) mut_dist1 = self.prot1.dist_mat[:,self.sub_pos] mut_dist2 = self.prot2.dist_mat[:,self.sub_pos] # Average the distance across both structures, # and get the minimum distance per residue to a mutated position self.mut_dist = np.nanmin(0.5 * (mut_dist1 + mut_dist2), axis=1) def calculate_ldd(self)-
Calculate LDD
Expand source code
def calculate_ldd(self): """Calculate LDD""" self.ldd = self._calculate_deformation(self._calculate_ldd_residue) def calculate_lddt(self)-
Calculate LDDT
Expand source code
def calculate_lddt(self): """Calculate LDDT""" self.lddt = self._calculate_deformation(self._calculate_lddt_residue) def calculate_neighborhood_dist(self)-
Calculate neighborhood_distance
Expand source code
def calculate_neighborhood_dist(self): """Calculate neighborhood_distance""" self.neighbor_distance = self._calculate_deformation(self._calculate_nd_residue) def calculate_non_affine(self)-
Calculate non-affine strain
Expand source code
def calculate_non_affine(self): """Calculate non-affine strain""" self.non_affine = self._calculate_deformation(self._calculate_non_affine_residue) def calculate_rmsd(self)-
Calculate RMSD
Cannot calculate RMSD with averaged neighborhoods, so if an AverageProtein object is passsed, we simply take the first Protein object and use it to calculate RMSD.
Expand source code
def calculate_rmsd(self): """ Calculate RMSD Cannot calculate RMSD with averaged neighborhoods, so if an AverageProtein object is passsed, we simply take the first Protein object and use it to calculate RMSD. """ coords = [] for prot in self.proteins: if isinstance(prot, Protein): coords.append(prot.coord) elif isinstance(prot, AverageProtein): coords.append(prot.proteins[0].coord) print("WARNING! Trying to calculate RMSD with an AverageProtein object. " + \ "Since this is not possible, an embedded Protein object is used instead.") c1, c2 = coords # Remove NAN values idx = ~np.any(np.isnan(c1) | np.isnan(c2), axis=1) c1, c2 = c1[idx], c2[idx] c1 = c1 - np.mean(c1, axis=0).reshape(1, 3) c2 = c2 - np.mean(c2, axis=0).reshape(1, 3) c3 = rotate_points(c2, c1) sd = np.sum((c1 - c3)**2, axis=1) self.rmsd_per_residue = np.sqrt(sd) self.rmsd = np.sqrt(np.mean(sd)) def calculate_shear(self)-
Calculate shear strain
Expand source code
def calculate_shear(self): """Calculate shear strain""" self.shear = self._calculate_deformation(self._calculate_shear_residue) def calculate_strain(self)-
Calculate effective strain
Expand source code
def calculate_strain(self): """Calculate effective strain""" self.strain = self._calculate_deformation(self._calculate_strain_residue) def run(self)-
Runs through all of the methods in self.method.
The output of each method, {m}, is stored in self.{m} (e.g. self.strain); RMSD is stored as self.rmsd (whole protein), and as RMSD per residue, self.rmsd_per_residue
Expand source code
def run(self): """ Runs through all of the methods in self.method. The output of each method, {m}, is stored in self.{m} (e.g. self.strain); RMSD is stored as self.rmsd (whole protein), and as RMSD per residue, self.rmsd_per_residue """ # Calculate deformation based on the specified method for method in self.method: self._run_analysis(method) def save_output(self, path_out)-
Save outputs to CSV file
Expand source code
def save_output(self, path_out): """Save outputs to CSV file""" # Load any deformation that was calculated deform = {} for m in self.all_methods: if hasattr(self, m): if m != 'rmsd': deform[m] = getattr(self, m) else: deform[m] = self.rmsd_per_residue # Only save output if deformation was calculated if not len(deform): raise Exception("ERROR! Cannot save output if there is none!") # Add sequence indices, residue names else: type_names = ['residue_index', 'protA_resname', 'protB_resname'] res_data = [np.arange(len(self.prot1.sequence)) + 1, self.prot1.sequence, self.prot2.sequence] output = {k: d for k, d in zip(type_names, res_data) if not isinstance(d, type(None))} ######################################## ### NEED TO ADD NUMBER OF NEIGHBORS ######################################## output.update(deform) pd.DataFrame(output).to_csv(path_out, index=False) def set_method(self, value)-
Set the method(s) to be used by self.run()
(str, list, set, tuple, np.ndarray) > 'all' results in all methods being used; > multiple methods can be passed as an iterable object list of all methods: > 'mut_dist' > 'strain' > 'shear' > 'non_affine' > 'ldd' > 'lddt' > 'neighborhood_dist' > 'rmsd'
Expand source code
def set_method(self, value): """Set the method(s) to be used by self.run() > (str, list, set, tuple, np.ndarray) > 'all' results in all methods being used; > multiple methods can be passed as an iterable object > list of all methods: > 'mut_dist' > 'strain' > 'shear' > 'non_affine' > 'ldd' > 'lddt' > 'neighborhood_dist' > 'rmsd' """ self.method = value self._parse_method()