Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/google-deepmind/alphafold3/llms.txt

Use this file to discover all available pages before exploring further.

Overview

The MSA (Multiple Sequence Alignment) module provides tools for generating, processing, and featurising multiple sequence alignments. MSAs capture evolutionary information that AlphaFold 3 uses to predict protein structure.

Classes

Msa

Container for multiple sequence alignments with manipulation methods.
class Msa:
    def __init__(
        self,
        query_sequence: str,
        chain_poly_type: str,
        sequences: Sequence[str],
        descriptions: Sequence[str],
        deduplicate: bool = True,
    )
query_sequence
str
required
The sequence used to search for the MSA.
chain_poly_type
str
required
Polymer type of the query sequence (see mmcif_names for valid types: PROTEIN_CHAIN, RNA_CHAIN, DNA_CHAIN).
sequences
Sequence[str]
required
MSA sequences returned by the search tool. The first sequence must match the query sequence in featurised form. If no sequences are provided, the MSA contains only the query sequence.
descriptions
Sequence[str]
required
Metadata for each MSA sequence. Must match length of sequences.
deduplicate
bool
default:"True"
Whether to deduplicate MSA sequences in input order. Lowercase letters (insertions) are ignored during deduplication.
Properties:
depth
int
Number of sequences in the MSA.
query_sequence
str
The original query sequence.
chain_poly_type
str
The polymer type of the sequences.
sequences
list[str]
List of MSA sequences.
descriptions
list[str]
List of sequence descriptions.
Example:
from alphafold3.data import msa
from alphafold3.constants import mmcif_names

# Create MSA from sequences
query_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"

sequences = [
    query_seq,
    "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD-LSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL",
    "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERaIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"
]

descriptions = [
    "Original query",
    "UniRef90_A0A123ABC1",
    "UniRef90_B1B234DEF2"
]

msa_obj = msa.Msa(
    query_sequence=query_seq,
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    sequences=sequences,
    descriptions=descriptions,
    deduplicate=True
)

print(f"MSA depth: {msa_obj.depth}")
print(f"Polymer type: {msa_obj.chain_poly_type}")

Class Methods

from_a3m
Parse a single A3M format string and build an MSA object.
@classmethod
def from_a3m(
    cls,
    query_sequence: str,
    chain_poly_type: str,
    a3m: str,
    max_depth: int | None = None,
    deduplicate: bool = True,
) -> Self
query_sequence
str
required
The query sequence used for MSA search.
chain_poly_type
str
required
Polymer type of the sequence.
a3m
str
required
MSA in A3M format.
max_depth
int | None
Maximum number of sequences to keep. If specified and positive, crops MSA to this depth.
deduplicate
bool
default:"True"
Whether to deduplicate sequences.
Example:
a3m_string = """>query
MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQ
>hit1
MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD-LSGAEKAVQVKVKALPDAQ
"""

msa_obj = msa.Msa.from_a3m(
    query_sequence="MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQ",
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    a3m=a3m_string,
    max_depth=5000
)
from_multiple_a3ms
Merge multiple A3M strings into a single MSA.
@classmethod
def from_multiple_a3ms(
    cls,
    a3ms: Sequence[str],
    chain_poly_type: str,
    deduplicate: bool = True,
) -> Self
a3ms
Sequence[str]
required
Multiple A3M strings from different tools/databases. Query sequences must match across all A3Ms.
chain_poly_type
str
required
Polymer type of the sequences.
deduplicate
bool
default:"True"
Whether to deduplicate merged sequences.
Example:
# Combine results from multiple databases
uniref_a3m = "..."
unclustered_a3m = "..."

msa_obj = msa.Msa.from_multiple_a3ms(
    a3ms=[uniref_a3m, unclustered_a3m],
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    deduplicate=True
)
from_multiple_msas
Merge multiple MSA objects into one.
@classmethod
def from_multiple_msas(
    cls,
    msas: Sequence[Self],
    deduplicate: bool = True
) -> Self
msas
Sequence[Msa]
required
Multiple MSA objects. All must have matching query sequences and polymer types.
deduplicate
bool
default:"True"
Whether to deduplicate merged sequences.
from_empty
Create an empty MSA containing only the query sequence.
@classmethod
def from_empty(cls, query_sequence: str, chain_poly_type: str) -> Self
Example:
# Useful when MSA search returns no results
empty_msa = msa.Msa.from_empty(
    query_sequence="MKTAYIAKQRQISFVK",
    chain_poly_type=mmcif_names.PROTEIN_CHAIN
)
print(f"Empty MSA depth: {empty_msa.depth}")  # Output: 1

Instance Methods

to_a3m
Convert the MSA to A3M format string.
def to_a3m(self) -> str
Example:
a3m_output = msa_obj.to_a3m()
with open("output.a3m", "w") as f:
    f.write(a3m_output)
featurize
Convert MSA to numerical features for model input.
def featurize(self) -> MutableMapping[str, np.ndarray]
return
MutableMapping[str, np.ndarray]
Dictionary with keys:
  • msa: Encoded MSA sequences as integer array
  • deletion_matrix: Deletion counts at each position
  • msa_species_identifiers: Species IDs extracted from descriptions
  • num_alignments: Total number of sequences
Raises:
  • msa.Error: If sequences have different lengths after lowercase letters (insertions relative to the query) are removed, if they contain unknown residue codes, or if the MSA is empty after alignment
Example:
try:
    features = msa_obj.featurize()
    print(f"MSA shape: {features['msa'].shape}")
    print(f"Deletion matrix shape: {features['deletion_matrix'].shape}")
    print(f"Number of alignments: {features['num_alignments']}")
except msa.Error as e:
    print(f"Featurization failed: {e}")

Functions

get_msa

Run MSA search tool and return MSA object.
def get_msa(
    target_sequence: str,
    run_config: msa_config.RunConfig,
    chain_poly_type: str,
    deduplicate: bool = False,
) -> Msa
target_sequence
str
required
The amino acid or nucleotide sequence to search.
run_config
msa_config.RunConfig
required
MSA run configuration specifying tool and parameters.
chain_poly_type
str
required
Type of chain for MSA search (protein, RNA, DNA).
deduplicate
bool
default:"False"
Whether to deduplicate sequences (insertions ignored).
return
Msa
MSA object containing aligned sequences.
Example:
from alphafold3.data import msa_config

# Configure Jackhmmer search
config = msa_config.RunConfig(
    config=msa_config.JackhmmerConfig(
        binary_path="/usr/bin/jackhmmer",
        database_config=msa_config.DatabaseConfig(path="/data/uniref90.fasta"),
        n_cpu=8,
        n_iter=1,
        e_value=0.0001,
        max_sequences=10000
    ),
    crop_size=5000
)

# Run search
msa_result = msa.get_msa(
    target_sequence="MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTL",
    run_config=config,
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    deduplicate=True
)

print(f"Found {msa_result.depth} sequences")

get_msa_tool

Get an MSA search tool instance from configuration.
def get_msa_tool(
    msa_tool_config: msa_config.JackhmmerConfig | msa_config.NhmmerConfig,
) -> msa_tool.MsaTool
msa_tool_config
msa_config.JackhmmerConfig | msa_config.NhmmerConfig
required
Configuration for Jackhmmer (protein) or Nhmmer (RNA/DNA) tool.
return
msa_tool.MsaTool
Configured MSA search tool instance.
Example:
# Configure tool for protein search
jackhmmer_config = msa_config.JackhmmerConfig(
    binary_path="/usr/bin/jackhmmer",
    database_config=msa_config.DatabaseConfig(path="/data/uniref90.fasta"),
    n_cpu=8,
    n_iter=1,
    e_value=0.0001,
    max_sequences=10000
)

tool = msa.get_msa_tool(jackhmmer_config)
result = tool.query("MKTAYIAKQRQISFVK")
print(result.a3m)

sequences_are_feature_equivalent

Check if two sequences produce identical features.
def sequences_are_feature_equivalent(
    sequence1: str,
    sequence2: str,
    chain_poly_type: str,
) -> bool
sequence1
str
required
First sequence to compare.
sequence2
str
required
Second sequence to compare.
chain_poly_type
str
required
Polymer type for featurisation.
return
bool
True if sequences produce identical features, False otherwise.
Example:
# Check if sequences are equivalent for modeling
seq1 = "MKTAYIAKQRQISFVK"
seq2 = "MKTAYIAKQRQISFVK"  # Identical
seq3 = "MKTAYIAKQRQISFVX"  # Different (X vs K)

print(msa.sequences_are_feature_equivalent(seq1, seq2, mmcif_names.PROTEIN_CHAIN))  # True
print(msa.sequences_are_feature_equivalent(seq1, seq3, mmcif_names.PROTEIN_CHAIN))  # False

MSA Search Tools

Jackhmmer (Protein)

Iterative sequence search using HMM profiles. Best for protein sequences. Configuration:
jackhmmer_config = msa_config.JackhmmerConfig(
    binary_path="/usr/bin/jackhmmer",
    database_config=msa_config.DatabaseConfig(path="/data/uniref90.fasta"),
    n_cpu=8,              # Number of CPUs
    n_iter=1,             # Number of iterations
    e_value=0.0001,       # E-value threshold
    z_value=None,         # Database size (HMMER -Z) for E-value calculation, or None
    max_sequences=10000   # Maximum sequences to return
)

Nhmmer (RNA/DNA)

HMM-based search for nucleotide sequences. Used for RNA and DNA. Configuration:
nhmmer_config = msa_config.NhmmerConfig(
    binary_path="/usr/bin/nhmmer",
    hmmalign_binary_path="/usr/bin/hmmalign",
    hmmbuild_binary_path="/usr/bin/hmmbuild",
    database_config=msa_config.DatabaseConfig(path="/data/rfam.fasta"),
    n_cpu=8,
    e_value=0.001,
    max_sequences=5000,
    alphabet="rna"  # or "dna"
)

Error Handling

from alphafold3.data import msa

try:
    msa_obj = msa.Msa.from_a3m(
        query_sequence=query_seq,
        chain_poly_type=mmcif_names.PROTEIN_CHAIN,
        a3m=a3m_string
    )
    features = msa_obj.featurize()
except ValueError as e:
    # Raised for invalid inputs (mismatched sequences, etc.)
    print(f"Validation error: {e}")
except msa.Error as e:
    # Raised for MSA-specific errors (empty MSA, unknown residues, etc.)
    print(f"MSA processing error: {e}")

Build docs developers (and LLMs) love