Module PDAnalysis.protein

Expand source code
from collections import Counter
from pathlib import Path

from pathlib import Path
import numpy as np
from scipy.spatial.distance import cdist

from .pdb_parser import parse_pdb_coordinates, parse_mmcif_coordinates, load_and_fix_pdb_data
from .utils import rotate_points


class Protein:
    """ Protein object description:

    The Protein object accepts a protein structure as input (PDB file, CIF file, atomic coordinates),
    and represents the protein conformation as a list of local neighborhoods per residue.

    Attributes
    ----------
    seq_len : int
        length of the protein sequence

    sequence : np.ndarray(seq_len)
        single-letter amino-acid sequence

    coord : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, with nan values for missing residues
        and for residues excluded based on bfactor/pLDDT
        
    coord_raw : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, exactly as read from file

    idx : np.ndarray(seq_len)
        residue indices

    bfactor : np.ndarray(seq_len)
        Bfactor (temperature factor); mirrors plddt

    plddt : np.ndarray(seq_len)
        pLDDT (AlphaFold-predicted confidence score); mirrors bfactor

    dist_mat : np.ndarray(seq_len, seq_len)
        alpha-carbon distance matrix

    neigh_idx : list
        list of neighbor indices for each residue

    neigh_tensor: list
        list of neighborhood tensors for each residue


    Methods
    -------

    get_local_neighborhood
        Gets the indices of neighbors for each residue (neigh_idx),
        and calculates neighborhood tensors (neigh_tensor).

    """
    def __init__(self, inp_obj, **kwargs):
        """
        args
        ----------
        inp_obj : (str, Path, np.ndarray)
            > path (str, Path) to file:
                  allowed filetypes:
                      PDB / CIF file (.pdb, .ent, .cif)
                      coordinate text file (.txt, .dat)
                      coordinate numpy binary file (.npy)
            > coordinates (np.ndarray)

        kwargs
        ----------

        max_bfactor : float : default = 0.0
            if nonzero, excludes residues with zscore(bfactor) greater than max_bfactor

        min_plddt : float : default = 0.0
            if nonzero, excludes residues with pLDDT lower than min_plddt

        neigh_cut : float : default = 13.0
            cutoff radius used to determine neighborhood

        chain : str : default = ''
            if not '', only parses coordinates for a specific chain

        fix_pdb : bool : default = False
            if True, uses the SEQRES data to assign residue indices;
            neighborhoods of missing residues are stored as empty lists;
            this facilitates comparison of PDB structures that are missing different sets of residues

        delimiter : str : default = ' '
            delimiter for loading coordinates in text files;
            passed to np.loadtxt

        skip_rows : int : default = 0
            number of rows to skip at the top of text files;
            passed to np.loadtxt
            
        """
        # Parameters
        self.max_bfactor = kwargs.get('max_bfactor', 0.0)
        self.min_plddt = kwargs.get('min_plddt', 0.0)
        self.neigh_cut = kwargs.get('neigh_cut', 13.0)
        self.chain = kwargs.get('chain', '')
        self.fix_pdb =  kwargs.get('fix_pdb', False)
        self.delimiter = kwargs.get('delimiter', ' ')
        self.skip_rows = kwargs.get('skip_rows', 0)

        # Defaults
        self.sequence = None
        self.idx = None
        self.plddt = None
        self.bfactor = None
        self.dist_mat = None
        self.neigh_idx = []
        self.neigh_tensor = []
        self.seq_len = None         # Number of residues

        # Load data
        self._parse_input(inp_obj)
        self.get_local_neighborhood()


    def _parse_input(self, inp_obj):
        """
        Parses a range of input object types (.pdb, .ent, .cif, .txt, .dat, .npy)

        Extracts: coordinates, residue indices, amino-acid sequence, bfactor/pLDDT
        """

        if isinstance(inp_obj, (str, Path)):
            ext = Path(inp_obj).suffix

            ### Read PDB / mmCIF file
            if (ext in ['.pdb', '.cif', '.ent']):
                self._load_data_from_path(inp_obj, ext)

            ### Read text coordinate file
            elif (ext in ['.txt', '.dat']):
                try:
                    print(f"WARNING! Ambiguity in file format {ext}. Attempting to read coordinates.")
                    self.coord = np.loadtxt(inp_obj, delimiter=self.delimiter, skip_rows=self.skip_rows)
                    self.idx = np.arange(len(self.coord))
                except:
                    raise Exception("Could not read coordinates from input file!")

            ### Read nump binary coordinate file
            elif (ext == '.npy'):
                self.coord = np.load(inp_obj)
                self.idx = np.arange(len(self.coord))

        elif isinstance(inp_obj, np.ndarray):
            self.coord = inp_obj
            self.idx = np.arange(len(self.coord))

        else:
            raise Exception(f"Input object type {type(inp_obj)} is not supported.")

        if not isinstance(self.sequence, type(None)):
            self.seq_len = len(self.sequence)
        else:
            self.seq_len = len(self.coord)


    def _load_data_from_path(self, path, ext):
        """Parses data from a path, depending on the file type"""
        if self.fix_pdb:
            data = load_and_fix_pdb_data(path, self.chain)
            # (indices in PDB files are inconsistent, so we reorder
            #  the indices so that they start from zero)

        else:
            if ext in [".pdb", ".ent"]:
                data = parse_pdb_coordinates(path, self.chain)
            elif ext in [".cif"]:
                data = parse_mmcif_coordinates(path, self.chain)

            # Reorder indices so that they start from zero
            # (by default, AF-predicted structures start counting at one)
            if data[1][0] != 0:
                data[1] = data[1] - data[1][0]

        self.coord_raw = data[0]
        self.idx = data[1]
        self.sequence = data[2]
        self.bfactor = data[3]
        self.plddt = data[3].copy()

        # Fill in 'disordered' (high bfactor, low plddt) with NaN values
        self._update_nan_coords()


    # Fill in 'disordered' (high bfactor, low plddt) with NaN values
    def _update_nan_coords(self):
        """Constructs a coordinate array that includes nan values for
        missing residues and for residues excluded based on bfactor/pLDDT"""

        self.coord = np.zeros((len(self.sequence), 3)) * np.nan
        self.coord[self.idx] = self.coord_raw.copy()

        bfactor = np.zeros(len(self.sequence)) * np.nan
        bfactor[self.idx] = self.bfactor.copy()
        self.bfactor = bfactor

        if self.max_bfactor > 0.0:
            bfactor_zscore = (self.bfactor - np.nanmean(self.bfactor)) / np.nanstd(self.bfactor)
            self.coord[(bfactor_zscore > self.max_bfactor) | (np.isnan(self.bfactor))] = np.nan
        elif self.min_plddt > 0.0:
            self.coord[self.plddt <= self.min_plddt] = np.nan


    def _get_dist_mat(self):
        """Calculates alpha-carbon distance matrix"""
        self.dist_mat = cdist(self.coord, self.coord)
    

    def get_local_neighborhood(self):
        """Extracts local neighborhoods for each residue"""
        if not isinstance(self.dist_mat, np.ndarray):
            self._get_dist_mat()
        self.neigh_idx = [np.where((d > 0) & (d <= self.neigh_cut) & (np.isfinite(d)))[0] for d in self.dist_mat]
        self._calculate_neighbor_tensor()


    def _calculate_neighbor_tensor(self):
        """Calculates the neighborhood tensor for each residue"""
        self.neigh_tensor = []
        for i, idx in enumerate(self.neigh_idx):
            self.neigh_tensor.append(self.coord[idx] - self.coord[[i]])



class AverageProtein:
    """ AverageProtein object description:

    The AverageProtein object accepts a protein structure as input (PDB file, CIF file, atomic coordinates),
    and represents the protein conformation as a list of local neighborhoods per residue.

    Attributes
    ----------
    seq_len : int
        length of the protein sequence

    num_repeat : int
        number of proteins used to create an AverageProtein

    sequence : np.ndarray(seq_len)
        single-letter amino-acid sequence

    coord : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, with nan values for missing residues
        and for residues excluded based on bfactor/pLDDT
        
    coord_raw : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, exactly as read from file

    bfactor : np.ndarray(seq_len)
        Bfactor (temperature factor); mirrors plddt

    plddt : np.ndarray(seq_len)
        pLDDT (AlphaFold-predicted confidence score); mirrors bfactor

    dist_mat : np.ndarray(seq_len, seq_len)
        alpha-carbon distance matrix

    neigh_idx : list
        list of neighbor indices for each residue

    neigh_tensor: list
        list of average neighborhood tensors for each residue


    Methods
    -------

    get_average_structure
        Gets the indices of neighbors for each residue (neigh_idx),
        and calculates neighborhood tensors (neigh_tensor).

    recalculate_average_structure
        Recalculates the average structure after changing parameters.

    """
    def __init__(self, inp_obj, **kwargs):
        """
        args
        ----------
        inp_obj : list of (Protein, str, Path, np.ndarray)
            > Protein objects
            > path (str, Path) to file:
                  allowed filetypes:
                      PDB / CIF file (.pdb, .ent, .cif)
                      coordinate text file (.txt, .dat)
                      coordinate numpy binary file (.npy)
            > coordinates (np.ndarray)

        kwargs
        ----------

        max_bfactor : float : default = 0.0
            if nonzero, excludes residues with zscore(bfactor) greater than max_bfactor

        min_plddt : float : default = 0.0
            if nonzero, excludes residues with pLDDT lower than min_plddt

        neigh_cut : float : default = 13.0
            cutoff radius used to determine neighborhood

        chain : str : default = ''
            if not '', only parses coordinates for a specific chain

        fix_pdb : bool : default = False
            if True, uses the SEQRES data to assign residue indices;
            neighborhoods of missing residues are stored as empty lists;
            this facilitates comparison of PDB structures that are missing different sets of residues

        delimiter : str : default = ' '
            delimiter for loading coordinates in text files;
            passed to np.loadtxt

        skip_rows : int : default = 0
            number of rows to skip at the top of text files;
            passed to np.loadtxt
            
        average_plddt : bool : False
            average plddt across all Protein objects;
            otherwise uses plddt of the first Protein object
            
        average_bfactor : bool : False
            average bfactor across all Protein objects;
            otherwise uses bfactor of the first Protein object
            
        average_dist_mat : bool : False
            average dist_mat across all Protein objects;
            otherwise uses dist_mat of the first Protein object
            
        """
        # Parameters
        self.max_bfactor =      kwargs.get('max_bfactor', 0.0)
        self.min_plddt =        kwargs.get('min_plddt', 70.0)
        self.neigh_cut =        kwargs.get('neigh_cut', 13.0)
        self.chain =            kwargs.get('chain', '')
        self.fix_pdb =  kwargs.get('fix_pdb', False)
        self.delimiter = kwargs.get('delimiter', ' ')
        self.skip_rows = kwargs.get('skip_rows', 0)

        self.average_plddt =    kwargs.get('average_plddt', False)
        self.average_bfactor =  kwargs.get('average_bfactor', False)
        self.average_dist_mat = kwargs.get('average_bfactor', False)

        # Defaults
        self.plddt = None
        self.bfactor = None
        self.sequence = None
        self.dist_mat = None
        self.neigh_idx = []
        self.neigh_tensor = []
        self.seq_len = None         # Number of residues
        self.num_repeat = None      # Number of structures

        # Load data
        self._parse_input(inp_obj, **kwargs)
        self.get_average_structure()


    def _parse_input(self, inp_obj, **kwargs):
        """
        Parses a list of objects

        Parses a range of input object types (.pdb, .ent, .cif, .txt, .dat, .npy)

        Objects in a list need to be the same type

        Extracts: Protein objects
        """
        self.proteins = []
        # Ensure that input is a list or an array
        if not isinstance(inp_obj, (list, np.ndarray)):
            raise Exception(f"Input object type {type(inp_obj)} is not supported. Acceptable types:" + \
                             "\n\tA list of Protein objects" + \
                             "\n\tA list of Protein paths to pdb / mmcif files" + \
                             "\n\tA list of numpy arrays of atomic coordinates")

        # Parse inputs to get a list of Protein objects
        for item in inp_obj:
            if isinstance(item, Protein):
                self.proteins.append(item)

            elif isinstance(item, (str, Path, np.ndarray)):
                self.proteins.append(Protein(item, **kwargs))

            else:
                raise Exception(f"Input object type {type(item)} is not supported.")

        # Set pLDDT
        if self.average_plddt:
            try:
                self.plddt = np.mean([protein.plddt for protein in self.proteins], axis=0)
            except TypeError:
                self.plddt = None
        else:
            self.plddt = self.proteins[0].plddt

        # Set Bfactor
        if self.average_bfactor:
            try:
                self.bfactor = np.mean([protein.bfactor for protein in self.proteins], axis=0)
            except TypeError:
                self.bfactor = None
        else:
            self.bfactor = self.proteins[0].bfactor

        # Set Distance Matrix
        if self.average_dist_mat:
            self.dist_mat = np.mean([protein.dist_mat for protein in self.proteins], axis=0)
        else:
            self.dist_mat = self.proteins[0].dist_mat.copy()


        self.sequence = self.proteins[0].sequence
        self.seq_len = self.proteins[0].seq_len
        self.num_repeat = len(self.proteins)


    def get_average_structure(self):
        """Get averaged local neighborhoods"""
        self._get_local_neighborhood()

        # Only include neighbors for each residue that are neighbors in all structures
        self._consolidate_neighbor_lists()
        
        # Rotate neighborhood tensors to arbitrary reference neighborhoods,
        # then average over repeat structures
        self._rotate_and_average_neighbor_tensors()


    def _get_local_neighborhood(self):
        """Get local neighborhoods per residue"""
        for protein in self.proteins:
            # If not yet calculated, get the local neighborhood
            if len(self.neigh_idx) == 0:
                kwargs = {a:getattr(self, a) for a in ['min_plddt', 'max_bfactor', 'neigh_cut']}
                for k, v in kwargs.items():
                    setattr(protein, k, v)
                protein.get_local_neighborhood()


    def _consolidate_neighbor_lists(self):
        """Exclude neighbors that are not present in all Protein objects"""
        self.neigh_idx = []
        for i in range(self.seq_len):
            # Count how many times indices are included in neighbor lists
            idx_count = Counter(j for protein in self.proteins for j in protein.neigh_idx[i])
            # Only include indices that are in all neighbor lists
            self.neigh_idx.append([j for j, count in idx_count.items() if count == self.num_repeat])


    ### For each residue j, rotate all neighbourhoods to match the first one,
    ### then average over repeat structures
    def _rotate_and_average_neighbor_tensors(self):
        """Align neighborhood tensors by rotating to match the
           neighborhood tensor of the first Protein object."""
        self.neigh_tensor = []
        for i in range(self.seq_len):
            # If no neighbors..
            if not len(self.neigh_idx[i]):
                self.neigh_tensor.append(np.empty((0,3)))
                continue

            for j in range(self.num_repeat):
                # Only include the rows of the tensor that correspond to
                # the consolidated neighbor list
                idx = [i0 for i0, j0 in enumerate(self.proteins[j].neigh_idx[i]) if j0 in self.neigh_idx[i]]

                if not j:
                    # Do not rotate the first example for residue j
                    # Initialize the list
                    tensor = [self.proteins[j].neigh_tensor[i][idx]]

                else:
                    # Rotate tensor so that it matches the first (reference) tensor
                    rotated_tensor = rotate_points(self.proteins[j].neigh_tensor[i][idx], tensor[0])
                    tensor.append(rotated_tensor)
            self.neigh_tensor.append(np.array(tensor).mean(axis=0))


    def recalculate_average_structure(self):
        """Recalculate average structure after changing parameters""" 
        for protein in self.proteins:
            changed = np.zeros(3, bool)
            for i, attr in enumerate(['min_plddt', 'max_bfactor', 'neigh_cut']):
                old, new = getattr(protein, attr), getattr(self, attr)
                if old != new:
                    setattr(protein, attr, new)
                    changed[i] = True

            # If nothing has changed, exit function
            if np.all(changed == False):
                return

            # If "min_plddt" or "max_bfactor" have changed,
            # recalculate the coords as NaN values may be different
            if np.any(changed[:2] == True):
                protein._update_nan_coords()
                protein._get_dist_mat()

            # Recalculate neighborhoods with updated neighbor cutoff
            protein.get_local_neighborhood()

        self._consolidate_neighbor_lists()
        self._rotate_and_average_neighbor_tensors()

    

Classes

class AverageProtein (inp_obj, **kwargs)

AverageProtein object description:

The AverageProtein object accepts a protein structure as input (PDB file, CIF file, atomic coordinates), and represents the protein conformation as a list of local neighborhoods per residue.

Attributes

seq_len : int
length of the protein sequence
num_repeat : int
number of proteins used to create an AverageProtein
sequence : np.ndarray(seq_len)
single-letter amino-acid sequence
coord : np.ndarray(seq_len, 3)
xyz coordinates of alpha-carbons, with nan values for missing residues and for residues excluded based on bfactor/pLDDT
coord_raw : np.ndarray(seq_len, 3)
xyz coordinates of alpha-carbons, exactly as read from file
bfactor : np.ndarray(seq_len)
Bfactor (temperature factor); mirrors plddt
plddt : np.ndarray(seq_len)
pLDDT (AlphaFold-predicted confidence score); mirrors bfactor
dist_mat : np.ndarray(seq_len, seq_len)
alpha-carbon distance matrix
neigh_idx : list
list of neighbor indices for each residue
neigh_tensor : list
list of average neighborhood tensors for each residue

Methods

get_average_structure Gets the indices of neighbors for each residue (neigh_idx), and calculates neighborhood tensors (neigh_tensor).

recalculate_average_structure Recalculates the average structure after changing parameters.

Args

inp_obj : list of (Protein, str, Path, np.ndarray)

Protein objects path (str, Path) to file: allowed filetypes: PDB / CIF file (.pdb, .ent, .cif) coordinate text file (.txt, .dat) coordinate numpy binary file (.npy) coordinates (np.ndarray)

Kwargs

max_bfactor : float : default = 0.0 if nonzero, excludes residues with zscore(bfactor) greater than max_bfactor

min_plddt : float : default = 0.0 if nonzero, excludes residues with pLDDT lower than min_plddt

neigh_cut : float : default = 13.0 cutoff radius used to determine neighborhood

chain : str : default = '' if not '', only parses coordinates for a specific chain

fix_pdb : bool : default = False if True, uses the SEQRES data to assign residue indices; neighborhoods of missing residues are stored as empty lists; this facilitates comparison of PDB structures that are missing different sets of residues

delimiter : str : default = ' ' delimiter for loading coordinates in text files; passed to np.loadtxt

skip_rows : int : default = 0 number of rows to skip at the top of text files; passed to np.loadtxt

average_plddt : bool : False average plddt across all Protein objects; otherwise uses plddt of the first Protein object

average_bfactor : bool : False average bfactor across all Protein objects; otherwise uses bfactor of the first Protein object

average_dist_mat : bool : False average dist_mat across all Protein objects; otherwise uses dist_mat of the first Protein object

Expand source code
class AverageProtein:
    """ AverageProtein object description:

    The AverageProtein object accepts a protein structure as input (PDB file, CIF file, atomic coordinates),
    and represents the protein conformation as a list of local neighborhoods per residue.

    Attributes
    ----------
    seq_len : int
        length of the protein sequence

    num_repeat : int
        number of proteins used to create an AverageProtein

    sequence : np.ndarray(seq_len)
        single-letter amino-acid sequence

    coord : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, with nan values for missing residues
        and for residues excluded based on bfactor/pLDDT
        
    coord_raw : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, exactly as read from file

    bfactor : np.ndarray(seq_len)
        Bfactor (temperature factor); mirrors plddt

    plddt : np.ndarray(seq_len)
        pLDDT (AlphaFold-predicted confidence score); mirrors bfactor

    dist_mat : np.ndarray(seq_len, seq_len)
        alpha-carbon distance matrix

    neigh_idx : list
        list of neighbor indices for each residue

    neigh_tensor: list
        list of average neighborhood tensors for each residue


    Methods
    -------

    get_average_structure
        Gets the indices of neighbors for each residue (neigh_idx),
        and calculates neighborhood tensors (neigh_tensor).

    recalculate_average_structure
        Recalculates the average structure after changing parameters.

    """
    def __init__(self, inp_obj, **kwargs):
        """
        args
        ----------
        inp_obj : list of (Protein, str, Path, np.ndarray)
            > Protein objects
            > path (str, Path) to file:
                  allowed filetypes:
                      PDB / CIF file (.pdb, .ent, .cif)
                      coordinate text file (.txt, .dat)
                      coordinate numpy binary file (.npy)
            > coordinates (np.ndarray)

        kwargs
        ----------

        max_bfactor : float : default = 0.0
            if nonzero, excludes residues with zscore(bfactor) greater than max_bfactor

        min_plddt : float : default = 0.0
            if nonzero, excludes residues with pLDDT lower than min_plddt

        neigh_cut : float : default = 13.0
            cutoff radius used to determine neighborhood

        chain : str : default = ''
            if not '', only parses coordinates for a specific chain

        fix_pdb : bool : default = False
            if True, uses the SEQRES data to assign residue indices;
            neighborhoods of missing residues are stored as empty lists;
            this facilitates comparison of PDB structures that are missing different sets of residues

        delimiter : str : default = ' '
            delimiter for loading coordinates in text files;
            passed to np.loadtxt

        skip_rows : int : default = 0
            number of rows to skip at the top of text files;
            passed to np.loadtxt
            
        average_plddt : bool : False
            average plddt across all Protein objects;
            otherwise uses plddt of the first Protein object
            
        average_bfactor : bool : False
            average bfactor across all Protein objects;
            otherwise uses bfactor of the first Protein object
            
        average_dist_mat : bool : False
            average dist_mat across all Protein objects;
            otherwise uses dist_mat of the first Protein object
            
        """
        # Parameters
        self.max_bfactor =      kwargs.get('max_bfactor', 0.0)
        self.min_plddt =        kwargs.get('min_plddt', 70.0)
        self.neigh_cut =        kwargs.get('neigh_cut', 13.0)
        self.chain =            kwargs.get('chain', '')
        self.fix_pdb =  kwargs.get('fix_pdb', False)
        self.delimiter = kwargs.get('delimiter', ' ')
        self.skip_rows = kwargs.get('skip_rows', 0)

        self.average_plddt =    kwargs.get('average_plddt', False)
        self.average_bfactor =  kwargs.get('average_bfactor', False)
        self.average_dist_mat = kwargs.get('average_bfactor', False)

        # Defaults
        self.plddt = None
        self.bfactor = None
        self.sequence = None
        self.dist_mat = None
        self.neigh_idx = []
        self.neigh_tensor = []
        self.seq_len = None         # Number of residues
        self.num_repeat = None      # Number of structures

        # Load data
        self._parse_input(inp_obj, **kwargs)
        self.get_average_structure()


    def _parse_input(self, inp_obj, **kwargs):
        """
        Parses a list of objects

        Parses a range of input object types (.pdb, .ent, .cif, .txt, .dat, .npy)

        Objects in a list need to be the same type

        Extracts: Protein objects
        """
        self.proteins = []
        # Ensure that input is a list or an array
        if not isinstance(inp_obj, (list, np.ndarray)):
            raise Exception(f"Input object type {type(inp_obj)} is not supported. Acceptable types:" + \
                             "\n\tA list of Protein objects" + \
                             "\n\tA list of Protein paths to pdb / mmcif files" + \
                             "\n\tA list of numpy arrays of atomic coordinates")

        # Parse inputs to get a list of Protein objects
        for item in inp_obj:
            if isinstance(item, Protein):
                self.proteins.append(item)

            elif isinstance(item, (str, Path, np.ndarray)):
                self.proteins.append(Protein(item, **kwargs))

            else:
                raise Exception(f"Input object type {type(item)} is not supported.")

        # Set pLDDT
        if self.average_plddt:
            try:
                self.plddt = np.mean([protein.plddt for protein in self.proteins], axis=0)
            except TypeError:
                self.plddt = None
        else:
            self.plddt = self.proteins[0].plddt

        # Set Bfactor
        if self.average_bfactor:
            try:
                self.bfactor = np.mean([protein.bfactor for protein in self.proteins], axis=0)
            except TypeError:
                self.bfactor = None
        else:
            self.bfactor = self.proteins[0].bfactor

        # Set Distance Matrix
        if self.average_dist_mat:
            self.dist_mat = np.mean([protein.dist_mat for protein in self.proteins], axis=0)
        else:
            self.dist_mat = self.proteins[0].dist_mat.copy()


        self.sequence = self.proteins[0].sequence
        self.seq_len = self.proteins[0].seq_len
        self.num_repeat = len(self.proteins)


    def get_average_structure(self):
        """Get averaged local neighborhoods"""
        self._get_local_neighborhood()

        # Only include neighbors for each residue that are neighbors in all structures
        self._consolidate_neighbor_lists()
        
        # Rotate neighborhood tensors to arbitrary reference neighborhoods,
        # then average over repeat structures
        self._rotate_and_average_neighbor_tensors()


    def _get_local_neighborhood(self):
        """Get local neighborhoods per residue"""
        for protein in self.proteins:
            # If not yet calculated, get the local neighborhood
            if len(self.neigh_idx) == 0:
                kwargs = {a:getattr(self, a) for a in ['min_plddt', 'max_bfactor', 'neigh_cut']}
                for k, v in kwargs.items():
                    setattr(protein, k, v)
                protein.get_local_neighborhood()


    def _consolidate_neighbor_lists(self):
        """Exclude neighbors that are not present in all Protein objects"""
        self.neigh_idx = []
        for i in range(self.seq_len):
            # Count how many times indices are included in neighbor lists
            idx_count = Counter(j for protein in self.proteins for j in protein.neigh_idx[i])
            # Only include indices that are in all neighbor lists
            self.neigh_idx.append([j for j, count in idx_count.items() if count == self.num_repeat])


    ### For each residue j, rotate all neighbourhoods to match the first one,
    ### then average over repeat structures
    def _rotate_and_average_neighbor_tensors(self):
        """Align neighborhood tensors by rotating to match the
           neighborhood tensor of the first Protein object."""
        self.neigh_tensor = []
        for i in range(self.seq_len):
            # If no neighbors..
            if not len(self.neigh_idx[i]):
                self.neigh_tensor.append(np.empty((0,3)))
                continue

            for j in range(self.num_repeat):
                # Only include the rows of the tensor that correspond to
                # the consolidated neighbor list
                idx = [i0 for i0, j0 in enumerate(self.proteins[j].neigh_idx[i]) if j0 in self.neigh_idx[i]]

                if not j:
                    # Do not rotate the first example for residue j
                    # Initialize the list
                    tensor = [self.proteins[j].neigh_tensor[i][idx]]

                else:
                    # Rotate tensor so that it matches the first (reference) tensor
                    rotated_tensor = rotate_points(self.proteins[j].neigh_tensor[i][idx], tensor[0])
                    tensor.append(rotated_tensor)
            self.neigh_tensor.append(np.array(tensor).mean(axis=0))


    def recalculate_average_structure(self):
        """Recalculate average structure after changing parameters""" 
        for protein in self.proteins:
            changed = np.zeros(3, bool)
            for i, attr in enumerate(['min_plddt', 'max_bfactor', 'neigh_cut']):
                old, new = getattr(protein, attr), getattr(self, attr)
                if old != new:
                    setattr(protein, attr, new)
                    changed[i] = True

            # If nothing has changed, exit function
            if np.all(changed == False):
                return

            # If "min_plddt" or "max_bfactor" have changed,
            # recalculate the coords as NaN values may be different
            if np.any(changed[:2] == True):
                protein._update_nan_coords()
                protein._get_dist_mat()

            # Recalculate neighborhoods with updated neighbor cutoff
            protein.get_local_neighborhood()

        self._consolidate_neighbor_lists()
        self._rotate_and_average_neighbor_tensors()

Methods

def get_average_structure(self)

Get averaged local neighborhoods

Expand source code
def get_average_structure(self):
    """Get averaged local neighborhoods"""
    self._get_local_neighborhood()

    # Only include neighbors for each residue that are neighbors in all structures
    self._consolidate_neighbor_lists()
    
    # Rotate neighborhood tensors to arbitrary reference neighborhoods,
    # then average over repeat structures
    self._rotate_and_average_neighbor_tensors()
def recalculate_average_structure(self)

Recalculate average structure after changing parameters

Expand source code
def recalculate_average_structure(self):
    """Recalculate average structure after changing parameters""" 
    for protein in self.proteins:
        changed = np.zeros(3, bool)
        for i, attr in enumerate(['min_plddt', 'max_bfactor', 'neigh_cut']):
            old, new = getattr(protein, attr), getattr(self, attr)
            if old != new:
                setattr(protein, attr, new)
                changed[i] = True

        # If nothing has changed, exit function
        if np.all(changed == False):
            return

        # If "min_plddt" or "max_bfactor" have changed,
        # recalculate the coords as NaN values may be different
        if np.any(changed[:2] == True):
            protein._update_nan_coords()
            protein._get_dist_mat()

        # Recalculate neighborhoods with updated neighbor cutoff
        protein.get_local_neighborhood()

    self._consolidate_neighbor_lists()
    self._rotate_and_average_neighbor_tensors()
class Protein (inp_obj, **kwargs)

Protein object description:

The Protein object accepts a protein structure as input (PDB file, CIF file, atomic coordinates), and represents the protein conformation as a list of local neighborhoods per residue.

Attributes

seq_len : int
length of the protein sequence
sequence : np.ndarray(seq_len)
single-letter amino-acid sequence
coord : np.ndarray(seq_len, 3)
xyz coordinates of alpha-carbons, with nan values for missing residues and for residues excluded based on bfactor/pLDDT
coord_raw : np.ndarray(seq_len, 3)
xyz coordinates of alpha-carbons, exactly as read from file
idx : np.ndarray(seq_len)
residue indices
bfactor : np.ndarray(seq_len)
Bfactor (temperature factor); mirrors plddt
plddt : np.ndarray(seq_len)
pLDDT (AlphaFold-predicted confidence score); mirrors bfactor
dist_mat : np.ndarray(seq_len, seq_len)
alpha-carbon distance matrix
neigh_idx : list
list of neighbor indices for each residue
neigh_tensor : list
list of neighborhood tensors for each residue

Methods

get_local_neighborhood Gets the indices of neighbors for each residue (neigh_idx), and calculates neighborhood tensors (neigh_tensor).

Args

inp_obj : (str, Path, np.ndarray)

path (str, Path) to file: allowed filetypes: PDB / CIF file (.pdb, .ent, .cif) coordinate text file (.txt, .dat) coordinate numpy binary file (.npy) coordinates (np.ndarray)

Kwargs

max_bfactor : float : default = 0.0 if nonzero, excludes residues with zscore(bfactor) greater than max_bfactor

min_plddt : float : default = 0.0 if nonzero, excludes residues with pLDDT lower than min_plddt

neigh_cut : float : default = 13.0 cutoff radius used to determine neighborhood

chain : str : default = '' if not '', only parses coordinates for a specific chain

fix_pdb : bool : default = False if True, uses the SEQRES data to assign residue indices; neighborhoods of missing residues are stored as empty lists; this facilitates comparison of PDB structures that are missing different sets of residues

delimiter : str : default = ' ' delimiter for loading coordinates in text files; passed to np.loadtxt

skip_rows : int : default = 0 number of rows to skip at the top of text files; passed to np.loadtxt

Expand source code
class Protein:
    """ Protein object description:

    The Protein object accepts a protein structure as input (PDB file, CIF file, atomic coordinates),
    and represents the protein conformation as a list of local neighborhoods per residue.

    Attributes
    ----------
    seq_len : int
        length of the protein sequence

    sequence : np.ndarray(seq_len)
        single-letter amino-acid sequence

    coord : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, with nan values for missing residues
        and for residues excluded based on bfactor/pLDDT
        
    coord_raw : np.ndarray(seq_len, 3)
        xyz coordinates of alpha-carbons, exactly as read from file

    idx : np.ndarray(seq_len)
        residue indices

    bfactor : np.ndarray(seq_len)
        Bfactor (temperature factor); mirrors plddt

    plddt : np.ndarray(seq_len)
        pLDDT (AlphaFold-predicted confidence score); mirrors bfactor

    dist_mat : np.ndarray(seq_len, seq_len)
        alpha-carbon distance matrix

    neigh_idx : list
        list of neighbor indices for each residue

    neigh_tensor: list
        list of neighborhood tensors for each residue


    Methods
    -------

    get_local_neighborhood
        Gets the indices of neighbors for each residue (neigh_idx),
        and calculates neighborhood tensors (neigh_tensor).

    """
    def __init__(self, inp_obj, **kwargs):
        """
        args
        ----------
        inp_obj : (str, Path, np.ndarray)
            > path (str, Path) to file:
                  allowed filetypes:
                      PDB / CIF file (.pdb, .ent, .cif)
                      coordinate text file (.txt, .dat)
                      coordinate numpy binary file (.npy)
            > coordinates (np.ndarray)

        kwargs
        ----------

        max_bfactor : float : default = 0.0
            if nonzero, excludes residues with zscore(bfactor) greater than max_bfactor

        min_plddt : float : default = 0.0
            if nonzero, excludes residues with pLDDT lower than min_plddt

        neigh_cut : float : default = 13.0
            cutoff radius used to determine neighborhood

        chain : str : default = ''
            if not '', only parses coordinates for a specific chain

        fix_pdb : bool : default = False
            if True, uses the SEQRES data to assign residue indices;
            neighborhoods of missing residues are stored as empty lists;
            this facilitates comparison of PDB structures that are missing different sets of residues

        delimiter : str : default = ' '
            delimiter for loading coordinates in text files;
            passed to np.loadtxt

        skip_rows : int : default = 0
            number of rows to skip at the top of text files;
            passed to np.loadtxt
            
        """
        # Parameters
        self.max_bfactor = kwargs.get('max_bfactor', 0.0)
        self.min_plddt = kwargs.get('min_plddt', 0.0)
        self.neigh_cut = kwargs.get('neigh_cut', 13.0)
        self.chain = kwargs.get('chain', '')
        self.fix_pdb =  kwargs.get('fix_pdb', False)
        self.delimiter = kwargs.get('delimiter', ' ')
        self.skip_rows = kwargs.get('skip_rows', 0)

        # Defaults
        self.sequence = None
        self.idx = None
        self.plddt = None
        self.bfactor = None
        self.dist_mat = None
        self.neigh_idx = []
        self.neigh_tensor = []
        self.seq_len = None         # Number of residues

        # Load data
        self._parse_input(inp_obj)
        self.get_local_neighborhood()


    def _parse_input(self, inp_obj):
        """
        Parses a range of input object types (.pdb, .ent, .cif, .txt, .dat, .npy)

        Extracts: coordinates, residue indices, amino-acid sequence, bfactor/pLDDT
        """

        if isinstance(inp_obj, (str, Path)):
            ext = Path(inp_obj).suffix

            ### Read PDB / mmCIF file
            if (ext in ['.pdb', '.cif', '.ent']):
                self._load_data_from_path(inp_obj, ext)

            ### Read text coordinate file
            elif (ext in ['.txt', '.dat']):
                try:
                    print(f"WARNING! Ambiguity in file format {ext}. Attempting to read coordinates.")
                    self.coord = np.loadtxt(inp_obj, delimiter=self.delimiter, skip_rows=self.skip_rows)
                    self.idx = np.arange(len(self.coord))
                except:
                    raise Exception("Could not read coordinates from input file!")

            ### Read nump binary coordinate file
            elif (ext == '.npy'):
                self.coord = np.load(inp_obj)
                self.idx = np.arange(len(self.coord))

        elif isinstance(inp_obj, np.ndarray):
            self.coord = inp_obj
            self.idx = np.arange(len(self.coord))

        else:
            raise Exception(f"Input object type {type(inp_obj)} is not supported.")

        if not isinstance(self.sequence, type(None)):
            self.seq_len = len(self.sequence)
        else:
            self.seq_len = len(self.coord)


    def _load_data_from_path(self, path, ext):
        """Parses data from a path, depending on the file type"""
        if self.fix_pdb:
            data = load_and_fix_pdb_data(path, self.chain)
            # (indices in PDB files are inconsistent, so we reorder
            #  the indices so that they start from zero)

        else:
            if ext in [".pdb", ".ent"]:
                data = parse_pdb_coordinates(path, self.chain)
            elif ext in [".cif"]:
                data = parse_mmcif_coordinates(path, self.chain)

            # Reorder indices so that they start from zero
            # (by default, AF-predicted structures start counting at one)
            if data[1][0] != 0:
                data[1] = data[1] - data[1][0]

        self.coord_raw = data[0]
        self.idx = data[1]
        self.sequence = data[2]
        self.bfactor = data[3]
        self.plddt = data[3].copy()

        # Fill in 'disordered' (high bfactor, low plddt) with NaN values
        self._update_nan_coords()


    # Fill in 'disordered' (high bfactor, low plddt) with NaN values
    def _update_nan_coords(self):
        """Constructs a coordinate array that includes nan values for
        missing residues and for residues excluded based on bfactor/pLDDT"""

        self.coord = np.zeros((len(self.sequence), 3)) * np.nan
        self.coord[self.idx] = self.coord_raw.copy()

        bfactor = np.zeros(len(self.sequence)) * np.nan
        bfactor[self.idx] = self.bfactor.copy()
        self.bfactor = bfactor

        if self.max_bfactor > 0.0:
            bfactor_zscore = (self.bfactor - np.nanmean(self.bfactor)) / np.nanstd(self.bfactor)
            self.coord[(bfactor_zscore > self.max_bfactor) | (np.isnan(self.bfactor))] = np.nan
        elif self.min_plddt > 0.0:
            self.coord[self.plddt <= self.min_plddt] = np.nan


    def _get_dist_mat(self):
        """Calculates alpha-carbon distance matrix"""
        self.dist_mat = cdist(self.coord, self.coord)
    

    def get_local_neighborhood(self):
        """Extracts local neighborhoods for each residue"""
        if not isinstance(self.dist_mat, np.ndarray):
            self._get_dist_mat()
        self.neigh_idx = [np.where((d > 0) & (d <= self.neigh_cut) & (np.isfinite(d)))[0] for d in self.dist_mat]
        self._calculate_neighbor_tensor()


    def _calculate_neighbor_tensor(self):
        """Calculates the neighborhood tensor for each residue"""
        self.neigh_tensor = []
        for i, idx in enumerate(self.neigh_idx):
            self.neigh_tensor.append(self.coord[idx] - self.coord[[i]])

Methods

def get_local_neighborhood(self)

Extracts local neighborhoods for each residue

Expand source code
def get_local_neighborhood(self):
    """Extracts local neighborhoods for each residue"""
    if not isinstance(self.dist_mat, np.ndarray):
        self._get_dist_mat()
    self.neigh_idx = [np.where((d > 0) & (d <= self.neigh_cut) & (np.isfinite(d)))[0] for d in self.dist_mat]
    self._calculate_neighbor_tensor()