Module PDAnalysis.pdb_parser

Expand source code
from collections import defaultdict
from pathlib import Path

from Bio import Seq, SeqIO, SeqRecord, Align
from Bio.Data import IUPACData
from Bio.PDB import PDBParser, MMCIF2Dict
import numpy as np
import pandas as pd


#############################################################
### Load coordinates from PDB file


### Read 3-letter code in title case
def parse_3letter(x):
    """Convert 3-letter amino acid code to a single-letter"""
    return IUPACData.protein_letters_3to1.get(x[0].upper() + x[1:].lower(), 'X')


def parse_pdb_coordinates(path, chain='', model=0, all_atom=False):
    """Parse coordinates, residue indices, sequence, and bfactor/plddt from a PDB file"""
    parser = PDBParser(QUIET=True)
    chains = list(parser.get_structure('', path)[model])
    coord, idx, seq, bfac = [], [], [], []

    for ch in chains:
        if (ch.get_id() == chain) or (chain == ''):
            for residue in ch:
                try:
                    h, i, _ = residue.get_id()
                    # Ignore HETATM entries
                    if h.strip() != '':
                        continue
                    aa = residue.resname
                    if all_atom:
                        for atom in residue:
                            coord.append(atom.coord)
                            idx.append(i)
                            seq.append(parse_3letter(aa))
                            bfac.append(atom.bfactor)
                    else:
                        if 'CA' not in residue:
                            continue
                        ca = residue['CA'].coord
                        coord.append(ca)
                        idx.append(i)
                        seq.append(parse_3letter(aa))
                        bfac.append(residue['CA'].bfactor)
                except Exception as e:
                    print(f"{path}\n{e}")
                    continue
    return [np.array(x) for x in [coord, idx, seq, bfac]]




#############################################################
### Load coordinates from mmCIF file


def parse_mmcif_coordinates(path, chain=''):
    """Parse coordinates, residue indices, sequence, and bfactor/plddt from a mmCIF file"""
    mmcif = reformat_mmcif_dict(MMCIF2Dict.MMCIF2Dict(path))
    df = pd.DataFrame(data=mmcif['_atom_site'])

    # Extract CA atoms
    # model '1'
    # ignore entires with incorrect sequence IDs
    # if there are multiple configurations, only choose 'A'
    df = df.loc[(df.label_atom_id=='CA')&(df.pdbx_PDB_model_num=='1')&(df.label_seq_id!='.')&(df.label_alt_id!='B')]

    # Extract specific CHAIN if arg is provided
    if len(chain):
        df = df.loc[(df.label_asym_id==chain)]

    coord = np.array([df[x].values for x in ['Cartn_x', 'Cartn_y', 'Cartn_z']]).T.astype(float)
    idx = df['label_seq_id'].values.astype(int)
    seq = np.array([parse_3letter(aa) for aa in df.label_comp_id])
    bfac = df["B_iso_or_equiv"].values.astype(float)
    return [coord, idx, seq, bfac]


def reformat_mmcif_dict(mmcif):
    """Reformat mmCIF dictionary to a nested dictionary"""
    new_dict = defaultdict(dict)
    for k, v in mmcif.items():
        if '.' not in k:
            new_dict[k] = v 
            continue
        k1 = k.split('.')[0]
        k2 = '.'.join(k.split('.')[1:])
        new_dict[k1][k2] = v 
    return new_dict



#############################################################
### Load SEQRES 

def load_pdb_seqres(path, chain=''):
    """Load SEQRES sequence from PDB file"""
    for record in SeqIO.parse(path, "pdb-seqres"):
        if not len(chain):
            return str(record.seq)
        elif record.annotations['chain'] == chain:
            return str(record.seq)


def load_mmcif_seqres(path, chain=''):
    """Load SEQRES sequence from mmCIF file"""
    mmcif = reformat_mmcif_dict(MMCIF2Dict.MMCIF2Dict(path))
    df = pd.DataFrame(data=mmcif['_pdbx_poly_seq_scheme'])

    # Extract specific CHAIN if arg is provided
    if len(chain):
        df = df.loc[(df.asym_id==chain)]

    seq = np.array([parse_3letter(aa) for aa in df.mon_id])
    return ''.join(seq)


#############################################################
### Load PDB file, reorder indices so they start from zero,
### and fill in missing coordinates / Bfactor with NaN values.


### Load SEQRES sequence, and use that to get a more complete
### description of the protein structure that preserves residues
### that are missing atoms.
### Doesn't work for some rare weird cases that you get in the PDB:
###     e.g. microheterogeneity??? (multiple amino acids for one site, somehow; eg. 1eis)
def load_and_fix_pdb_data(path, chain=''):
    """
    Read PDB / mmCIF file

    Load the SEQRES sequence, and match the atomic coordinates to
    indices based on the SEQRES sequence.
    """
    ext = Path(path).suffix

    # Load the sequence from the SEQRES part
    # Load the coords, sequence, etc., from the ATOM part
    if ext in ['.pdb', '.ent']:
        seqres = load_pdb_seqres(path, chain)
        xyz, idx, seq, bfac = parse_pdb_coordinates(path, chain)

    elif ext == '.cif':
        seqres = load_mmcif_seqres(path, chain)
        xyz, idx, seq, bfac = parse_mmcif_coordinates(path, chain)

    # If the ATOM sequence is longer than the SEQRES sequence,
    # then there is a problem
    if len(seqres) < len(seq):
        raise Exception("ERROR: the number of alpha-carbons exceeds the number of SEQRES residues!" + \
                        "\n\tPlease check your input files for errors.")

    # If there are no missing atoms, the two sequences will be equal.
    # In this case, the indices will be an integer series starting at 0
    if seqres == ''.join(seq):
        idx = np.arange(len(seq))
    else:
        # If there are missing atoms, try to align the SEQRES / ATOM sequences
        # to get the correct indices
        is_clear, idx = match_xyz_indices_to_seqres(seqres, xyz, seq)
        # If not "is_clear" (if any atom positions are ambiguous),
        # ignore ambiguous atoms
        if not is_clear:
            if len(idx):
                xyz, seq, bfac = [x[idx] for x in [xyz, seq, bfac]]
            else:
                raise Exception(f"Error reading pdb file\n\t{path}")

    # Fill in missing coordinates with nan values
    xyz_nan = np.zeros((len(seqres), 3), float) * np.nan
    xyz_nan[idx] = xyz 

    return [np.array(x) for x in [xyz, idx, list(seqres), bfac, xyz_nan]]


### Match SEQRES sequence to the sequence obtained from 
### the atomic coordinates. Do NOT allow any mismatches, only gaps.
def match_xyz_indices_to_seqres(seqres, xyz, seq):
    """Match the SEQRES sequence to the sequence obtained from ATOM entries"""
    # Find all residues that are connected along the backbone,
    # and cluster them into unbroken stretches of amino acids
    seq_clusters = find_neighbours(seq, xyz)
    # Run a strict alignment algorithm that discards candidate alignments
    # if they do not agree provide the same set of unbroken sequences
    # of amino acids (sequence clusters)
    candidates = align_sequences(seqres, ''.join(seq), seq_clusters)
    if len(candidates) > 1:
        # If there is more than one alignment, return a single index
        # that corresponds to the positions in the first alignment.
        return False, resolve_ambiguity(candidates)

    elif len(candidates) == 1:
        # If there is only one candidate...
        return True, np.where(np.array(list(candidates[0])) != '-')[0]

    else:
        # If there are no candidates identified with the strict alignment algorithm,
        # run without enforcing equivalence of sequence clusters
        candidates = align_sequences(seqres, ''.join(seq))
        if len(candidates) > 1:
            return False, resolve_ambiguity(candidates)

        elif len(candidates) == 1:
            return True, np.where(np.array(list(candidates[0])) != '-')[0]

        else:
            # If there are still no candidates, return False
            return False, []


### Distance between alpha-carbons in neighbouring
### amino acids ought to be about 3.8 AA.
### Cutoff is higher than this, due to inaccuracies in the PDB.
### See "12ca", positions 125-127 as an example
### Occasionally, this method fails because a string of disordered residues
### are missing, yet the amino acids bookending the missing residues are indexed beside
### each other; not actually that rare (e.g., 4s34_A)
def find_neighbours(seq, xyz, cut=4.3):
    """Find backbone neighbors based on their Alpha-carbon distances"""
    D = np.linalg.norm(xyz[:-1] - xyz[1:], axis=1)
    seq_clusters = []
    cluster = seq[0]
    for s, d in zip(seq[1:], D):
        if d < cut:
            cluster = cluster + s
        else:
            seq_clusters.append(cluster)
            cluster = s
    seq_clusters.append(cluster)
    return seq_clusters


def align_sequences(seqres, seq, seq_clusters=''):
    """Align SEQRES sequences to ATOM sequences"""
    candidates = []

    # Convert sequence clusters to set
    if not isinstance(seq_clusters, str):
        seqA = set(seq_clusters)

    # Loop through candidate alignments
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    # Set mismatch penalty to negative infinity,
    # since no mismatches are not allowed
    aligner.mismatch_score = -np.inf
    for align in aligner.align(seqres, seq):
        s1, s2 = align.sequences
        # If sequence clusters are provided, only allow candidates that
        # include all sequence clusters as unbroken sequences
        if not isinstance(seq_clusters, str):
            # Break alignment into sequence clusters by splitting
            # at gaps
            clusters = [c for c in s2.split('-') if len(c)]
            # If all of the previously identified sequence clusters
            # are found in the alignment, add the candidate to the output
            if seqA.issubset(set(clusters)):
                candidates.append(s2)
        else:
            candidates.append(s2)
    return candidates


### Discard any ambiguous residues.
# If there are ambiguous regions, they will be
# as long as the number of candidates, so we can
# account for the indices
# These cases are rare: the algorithm is only called for 3 PDB entries out of ~4000.
def resolve_ambiguity(cand):
    """Resolve ambiguous cases where there is more than one alignment between SEQRES and ATOM sequences"""
    cand = np.array([list(x) for x in cand])
    N = len(cand)
    idx = []
    icount = -1
    # Loop through sequence positions (including gaps)
    for i in range(cand.shape[1]):
        # If no gaps, there is no ambiguity; count the index
        if '-' not in cand[:,i]:
            icount += 1
            idx.append(icount)
        # If there are gaps, ignore the index if there is a gap
        # in the first candidate alignment
        else:
            if cand[0,i] != '-':
                icount += 1
#               idx.append(icount)
    return np.array(idx)

Functions

def align_sequences(seqres, seq, seq_clusters='')

Align SEQRES sequences to ATOM sequences

Expand source code
def align_sequences(seqres, seq, seq_clusters=''):
    """Align SEQRES sequences to ATOM sequences"""
    candidates = []

    # Convert sequence clusters to set
    if not isinstance(seq_clusters, str):
        seqA = set(seq_clusters)

    # Loop through candidate alignments
    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    # Set mismatch penalty to negative infinity,
    # since no mismatches are not allowed
    aligner.mismatch_score = -np.inf
    for align in aligner.align(seqres, seq):
        s1, s2 = align.sequences
        # If sequence clusters are provided, only allow candidates that
        # include all sequence clusters as unbroken sequences
        if not isinstance(seq_clusters, str):
            # Break alignment into sequence clusters by splitting
            # at gaps
            clusters = [c for c in s2.split('-') if len(c)]
            # If all of the previously identified sequence clusters
            # are found in the alignment, add the candidate to the output
            if seqA.issubset(set(clusters)):
                candidates.append(s2)
        else:
            candidates.append(s2)
    return candidates
def find_neighbours(seq, xyz, cut=4.3)

Find backbone neighbors based on their Alpha-carbon distances

Expand source code
def find_neighbours(seq, xyz, cut=4.3):
    """Find backbone neighbors based on their Alpha-carbon distances"""
    D = np.linalg.norm(xyz[:-1] - xyz[1:], axis=1)
    seq_clusters = []
    cluster = seq[0]
    for s, d in zip(seq[1:], D):
        if d < cut:
            cluster = cluster + s
        else:
            seq_clusters.append(cluster)
            cluster = s
    seq_clusters.append(cluster)
    return seq_clusters
def load_and_fix_pdb_data(path, chain='')

Read PDB / mmCIF file

Load the SEQRES sequence, and match the atomic coordinates to indices based on the SEQRES sequence.

Expand source code
def load_and_fix_pdb_data(path, chain=''):
    """
    Read PDB / mmCIF file

    Load the SEQRES sequence, and match the atomic coordinates to
    indices based on the SEQRES sequence.
    """
    ext = Path(path).suffix

    # Load the sequence from the SEQRES part
    # Load the coords, sequence, etc., from the ATOM part
    if ext in ['.pdb', '.ent']:
        seqres = load_pdb_seqres(path, chain)
        xyz, idx, seq, bfac = parse_pdb_coordinates(path, chain)

    elif ext == '.cif':
        seqres = load_mmcif_seqres(path, chain)
        xyz, idx, seq, bfac = parse_mmcif_coordinates(path, chain)

    # If the ATOM sequence is longer than the SEQRES sequence,
    # then there is a problem
    if len(seqres) < len(seq):
        raise Exception("ERROR: the number of alpha-carbons exceeds the number of SEQRES residues!" + \
                        "\n\tPlease check your input files for errors.")

    # If there are no missing atoms, the two sequences will be equal.
    # In this case, the indices will be an integer series starting at 0
    if seqres == ''.join(seq):
        idx = np.arange(len(seq))
    else:
        # If there are missing atoms, try to align the SEQRES / ATOM sequences
        # to get the correct indices
        is_clear, idx = match_xyz_indices_to_seqres(seqres, xyz, seq)
        # If not "is_clear" (if any atom positions are ambiguous),
        # ignore ambiguous atoms
        if not is_clear:
            if len(idx):
                xyz, seq, bfac = [x[idx] for x in [xyz, seq, bfac]]
            else:
                raise Exception(f"Error reading pdb file\n\t{path}")

    # Fill in missing coordinates with nan values
    xyz_nan = np.zeros((len(seqres), 3), float) * np.nan
    xyz_nan[idx] = xyz 

    return [np.array(x) for x in [xyz, idx, list(seqres), bfac, xyz_nan]]
def load_mmcif_seqres(path, chain='')

Load SEQRES sequence from mmCIF file

Expand source code
def load_mmcif_seqres(path, chain=''):
    """Load SEQRES sequence from mmCIF file"""
    mmcif = reformat_mmcif_dict(MMCIF2Dict.MMCIF2Dict(path))
    df = pd.DataFrame(data=mmcif['_pdbx_poly_seq_scheme'])

    # Extract specific CHAIN if arg is provided
    if len(chain):
        df = df.loc[(df.asym_id==chain)]

    seq = np.array([parse_3letter(aa) for aa in df.mon_id])
    return ''.join(seq)
def load_pdb_seqres(path, chain='')

Load SEQRES sequence from PDB file

Expand source code
def load_pdb_seqres(path, chain=''):
    """Load SEQRES sequence from PDB file"""
    for record in SeqIO.parse(path, "pdb-seqres"):
        if not len(chain):
            return str(record.seq)
        elif record.annotations['chain'] == chain:
            return str(record.seq)
def match_xyz_indices_to_seqres(seqres, xyz, seq)

Match the SEQRES sequence to the sequence obtained from ATOM entries

Expand source code
def match_xyz_indices_to_seqres(seqres, xyz, seq):
    """Match the SEQRES sequence to the sequence obtained from ATOM entries"""
    # Find all residues that are connected along the backbone,
    # and cluster them into unbroken stretches of amino acids
    seq_clusters = find_neighbours(seq, xyz)
    # Run a strict alignment algorithm that discards candidate alignments
    # if they do not agree provide the same set of unbroken sequences
    # of amino acids (sequence clusters)
    candidates = align_sequences(seqres, ''.join(seq), seq_clusters)
    if len(candidates) > 1:
        # If there is more than one alignment, return a single index
        # that corresponds to the positions in the first alignment.
        return False, resolve_ambiguity(candidates)

    elif len(candidates) == 1:
        # If there is only one candidate...
        return True, np.where(np.array(list(candidates[0])) != '-')[0]

    else:
        # If there are no candidates identified with the strict alignment algorithm,
        # run without enforcing equivalence of sequence clusters
        candidates = align_sequences(seqres, ''.join(seq))
        if len(candidates) > 1:
            return False, resolve_ambiguity(candidates)

        elif len(candidates) == 1:
            return True, np.where(np.array(list(candidates[0])) != '-')[0]

        else:
            # If there are still no candidates, return False
            return False, []
def parse_3letter(x)

Convert 3-letter amino acid code to a single-letter

Expand source code
def parse_3letter(x):
    """Convert 3-letter amino acid code to a single-letter"""
    return IUPACData.protein_letters_3to1.get(x[0].upper() + x[1:].lower(), 'X')
def parse_mmcif_coordinates(path, chain='')

Parse coordinates, residue indices, sequence, and bfactor/plddt from a mmCIF file

Expand source code
def parse_mmcif_coordinates(path, chain=''):
    """Parse coordinates, residue indices, sequence, and bfactor/plddt from a mmCIF file"""
    mmcif = reformat_mmcif_dict(MMCIF2Dict.MMCIF2Dict(path))
    df = pd.DataFrame(data=mmcif['_atom_site'])

    # Extract CA atoms
    # model '1'
    # ignore entires with incorrect sequence IDs
    # if there are multiple configurations, only choose 'A'
    df = df.loc[(df.label_atom_id=='CA')&(df.pdbx_PDB_model_num=='1')&(df.label_seq_id!='.')&(df.label_alt_id!='B')]

    # Extract specific CHAIN if arg is provided
    if len(chain):
        df = df.loc[(df.label_asym_id==chain)]

    coord = np.array([df[x].values for x in ['Cartn_x', 'Cartn_y', 'Cartn_z']]).T.astype(float)
    idx = df['label_seq_id'].values.astype(int)
    seq = np.array([parse_3letter(aa) for aa in df.label_comp_id])
    bfac = df["B_iso_or_equiv"].values.astype(float)
    return [coord, idx, seq, bfac]
def parse_pdb_coordinates(path, chain='', model=0, all_atom=False)

Parse coordinates, residue indices, sequence, and bfactor/plddt from a PDB file

Expand source code
def parse_pdb_coordinates(path, chain='', model=0, all_atom=False):
    """Parse coordinates, residue indices, sequence, and bfactor/plddt from a PDB file"""
    parser = PDBParser(QUIET=True)
    chains = list(parser.get_structure('', path)[model])
    coord, idx, seq, bfac = [], [], [], []

    for ch in chains:
        if (ch.get_id() == chain) or (chain == ''):
            for residue in ch:
                try:
                    h, i, _ = residue.get_id()
                    # Ignore HETATM entries
                    if h.strip() != '':
                        continue
                    aa = residue.resname
                    if all_atom:
                        for atom in residue:
                            coord.append(atom.coord)
                            idx.append(i)
                            seq.append(parse_3letter(aa))
                            bfac.append(atom.bfactor)
                    else:
                        if 'CA' not in residue:
                            continue
                        ca = residue['CA'].coord
                        coord.append(ca)
                        idx.append(i)
                        seq.append(parse_3letter(aa))
                        bfac.append(residue['CA'].bfactor)
                except Exception as e:
                    print(f"{path}\n{e}")
                    continue
    return [np.array(x) for x in [coord, idx, seq, bfac]]
def reformat_mmcif_dict(mmcif)

Reformat mmCIF dictionary to a nested dictionary

Expand source code
def reformat_mmcif_dict(mmcif):
    """Reformat mmCIF dictionary to a nested dictionary"""
    new_dict = defaultdict(dict)
    for k, v in mmcif.items():
        if '.' not in k:
            new_dict[k] = v 
            continue
        k1 = k.split('.')[0]
        k2 = '.'.join(k.split('.')[1:])
        new_dict[k1][k2] = v 
    return new_dict
def resolve_ambiguity(cand)

Resolve ambiguous cases where there is more than one alignment between SEQRES and ATOM sequences

Expand source code
def resolve_ambiguity(cand):
    """Resolve ambiguous cases where there is more than one alignment between SEQRES and ATOM sequences"""
    cand = np.array([list(x) for x in cand])
    N = len(cand)
    idx = []
    icount = -1
    # Loop through sequence positions (including gaps)
    for i in range(cand.shape[1]):
        # If no gaps, there is no ambiguity; count the index
        if '-' not in cand[:,i]:
            icount += 1
            idx.append(icount)
        # If there are gaps, ignore the index if there is a gap
        # in the first candidate alignment
        else:
            if cand[0,i] != '-':
                icount += 1
#               idx.append(icount)
    return np.array(idx)