Module pychnosz.biomolecules

Biomolecule thermodynamics package for CHNOSZ.

This package provides thermodynamic calculations for biological molecules including proteins, amino acids, and other biomolecules.

Sub-modules

pychnosz.biomolecules.proteins

Protein functions for CHNOSZ …

Functions

def add_protein(aa: pandas.core.frame.DataFrame, as_residue: bool = False) ‑> numpy.ndarray
Expand source code
def add_protein(aa: pd.DataFrame, as_residue: bool = False) -> np.ndarray:
    """
    Add protein amino acid compositions to thermo().protein.

    Proteins whose ``protein_organism`` ID is already present in
    thermo().protein are overwritten in place; all others are appended.
    The caller's DataFrame is never modified.

    Parameters
    ----------
    aa : DataFrame
        DataFrame with protein amino acid compositions.
        Must have same columns as thermo().protein
    as_residue : bool, default False
        Divide the chain count and the amino acid counts (columns 4
        through 24) by the protein length before storing them

    Returns
    -------
    array
        Row numbers of added/updated proteins in thermo().protein

    Raises
    ------
    RuntimeError
        If thermo().protein has not been loaded.
    ValueError
        If the columns of `aa` differ from thermo().protein, or if `aa`
        contains duplicated ``protein_organism`` IDs.

    Examples
    --------
    >>> import pandas as pd
    >>> from pychnosz import *
    >>> aa = pd.read_csv("POLG.csv")
    >>> iprotein = add_protein(aa)
    """
    t = thermo()

    if t.protein is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    # Check that columns match
    if list(aa.columns) != list(t.protein.columns):
        raise ValueError("'aa' does not have the same columns as thermo().protein")

    # Nothing to add: leave the database untouched
    if len(aa) == 0:
        return np.array([], dtype=int)

    # Check that new protein IDs are unique
    po = aa['protein'] + '_' + aa['organism']
    idup = po.duplicated()
    if idup.any():
        dup_proteins = po[idup].unique()
        raise ValueError(f"some protein IDs are duplicated: {' '.join(dup_proteins)}")

    # Normalize by protein length if as_residue = True
    if as_residue:
        pl = protein_length(aa)
        # Work on a copy so the caller's DataFrame is not modified
        aa = aa.copy()
        # Columns 4..24 inclusive: the chains column plus all 20 amino acid
        # columns.  (A 4:24 slice would leave the last amino acid column
        # unnormalized.)  Assigning by column label replaces the columns,
        # so integer counts can become fractional without dtype errors.
        count_cols = aa.columns[4:25]
        aa[count_cols] = aa[count_cols].div(pl, axis=0)

    # Find any protein IDs that are already present; pinfo() yields NaN
    # for IDs that are not found, so use a float array throughout
    ip = np.atleast_1d(np.asarray(pinfo(po.tolist()), dtype=float))
    ip_present = ~np.isnan(ip)

    # Now we're ready to go
    tp_new = t.protein.copy()

    # Add new proteins
    if not ip_present.all():
        new_proteins = aa[~ip_present].copy()
        tp_new = pd.concat([tp_new, new_proteins], ignore_index=True)

    # Update existing proteins
    if ip_present.any():
        valid_ip = ip[ip_present].astype(int)
        tp_new.iloc[valid_ip] = aa[ip_present].values

    # Update the protein database
    tp_new.reset_index(drop=True, inplace=True)
    t.protein = tp_new

    # Look up the row numbers again now that every protein is present
    ip_new = np.atleast_1d(pinfo(po.tolist()))

    # Print messages
    n_added = int(np.count_nonzero(~ip_present))
    n_replaced = int(np.count_nonzero(ip_present))

    if n_added > 0:
        print(f"add_protein: added {n_added} new protein(s) to thermo().protein")
    if n_replaced > 0:
        print(f"add_protein: replaced {n_replaced} existing protein(s) in thermo().protein")

    return ip_new

Add protein amino acid compositions to thermo().protein.

Parameters

aa : DataFrame
DataFrame with protein amino acid compositions. Must have same columns as thermo().protein
as_residue : bool, default False
Normalize amino acid counts by protein length

Returns

array
Row numbers of added/updated proteins in thermo().protein

Examples

>>> import pandas as pd
>>> from pychnosz import *
>>> aa = pd.read_csv("POLG.csv")
>>> iprotein = add_protein(aa)
def group_formulas() ‑> pandas.core.frame.DataFrame
Expand source code
def group_formulas() -> pd.DataFrame:
    """
    Return chemical formulas of amino acid residues.

    This function returns a DataFrame with the chemical formulas of
    H2O, the 20 amino acid sidechain groups, and the unfolded protein
    backbone group [UPBB].

    Returns
    -------
    DataFrame
        Chemical formulas with elements C, H, N, O, S as columns
        and residues as rows
    """
    # Chemical formulas as a numpy array
    # Rows: water, [Ala], [Cys], [Asp], [Glu], [Phe], [Gly], [His], [Ile], [Lys], [Leu],
    #       [Met], [Asn], [Pro], [Gln], [Arg], [Ser], [Thr], [Val], [Trp], [Tyr], [UPBB]
    # Columns: C, H, N, O, S
    A = np.array([
        [0, 2, 0, 1, 0],      # H2O
        [1, 3, 0, 0, 0],      # [Ala]
        [1, 3, 0, 0, 1],      # [Cys]
        [2, 3, 0, 2, 0],      # [Asp]
        [3, 5, 0, 2, 0],      # [Glu]
        [7, 7, 0, 0, 0],      # [Phe]
        [0, 1, 0, 0, 0],      # [Gly]
        [4, 5, 2, 0, 0],      # [His]
        [4, 9, 0, 0, 0],      # [Ile]
        [4, 10, 1, 0, 0],     # [Lys]
        [4, 9, 0, 0, 0],      # [Leu]
        [3, 7, 0, 0, 1],      # [Met]
        [2, 4, 1, 1, 0],      # [Asn]
        [3, 5, 0, 0, 0],      # [Pro]
        [3, 6, 1, 1, 0],      # [Gln]
        [4, 10, 3, 0, 0],     # [Arg]
        [1, 3, 0, 1, 0],      # [Ser]
        [2, 5, 0, 1, 0],      # [Thr]
        [3, 7, 0, 0, 0],      # [Val]
        [9, 8, 1, 0, 0],      # [Trp]
        [7, 7, 0, 1, 0],      # [Tyr]
        [2, 2, 1, 1, 0]       # [UPBB]
    ])

    rownames = ['H2O', '[Ala]', '[Cys]', '[Asp]', '[Glu]', '[Phe]', '[Gly]',
                '[His]', '[Ile]', '[Lys]', '[Leu]', '[Met]', '[Asn]', '[Pro]',
                '[Gln]', '[Arg]', '[Ser]', '[Thr]', '[Val]', '[Trp]', '[Tyr]',
                '[UPBB]']

    # Add [UPBB] to the sidechain groups to get residues
    out = A.copy()
    # Add [UPBB] (last row) to each sidechain group (rows 1-20)
    out[1:21, :] = out[1:21, :] + A[21, :]

    # Create DataFrame
    df = pd.DataFrame(out[0:21, :],
                     index=rownames[0:21],
                     columns=['C', 'H', 'N', 'O', 'S'])

    return df

Return chemical formulas of amino acid residues.

This function returns a DataFrame with the chemical formulas of H2O and the 20 amino acid residues. Each residue formula is the sum of a sidechain group and the unfolded protein backbone group [UPBB].

Returns

DataFrame
Chemical formulas with elements C, H, N, O, S as columns and residues as rows
def ionize_aa(aa: pandas.core.frame.DataFrame,
property: str = 'Z',
T: float | numpy.ndarray = 25.0,
P: float | str | numpy.ndarray = 'Psat',
pH: float | numpy.ndarray = 7.0,
ret_val: str | None = None,
suppress_Cys: bool = False) ‑> pandas.core.frame.DataFrame
Expand source code
def ionize_aa(aa: pd.DataFrame,
              property: str = "Z",
              T: Union[float, np.ndarray] = 25.0,
              P: Union[float, str, np.ndarray] = "Psat",
              pH: Union[float, np.ndarray] = 7.0,
              ret_val: Optional[str] = None,
              suppress_Cys: bool = False) -> pd.DataFrame:
    """
    Calculate additive ionization properties of proteins.

    This function calculates the net charge or other ionization properties
    of proteins based on amino acid composition at specified T, P, and pH.
    T, P and pH are recycled to a common length; subcrt is called only once
    for each distinct (T, P) pair.

    Parameters
    ----------
    aa : DataFrame
        Amino acid composition data; must contain the columns
        Cys, Asp, Glu, His, Lys, Arg, Tyr and chains
    property : str, default "Z"
        Property to calculate:
        - "Z": net charge
        - "A": chemical affinity
        - Other subcrt properties (G, H, S, Cp, V)
    T : float or array, default 25.0
        Temperature in degrees Celsius
    P : float, str, or array, default "Psat"
        Pressure in bar, or "Psat" for saturation
    pH : float or array, default 7.0
        pH value(s)
    ret_val : str, optional
        Return value type:
        - "pK": return pK values
        - "alpha": return degree of formation
        - "aavals": return amino acid values
        - None: return ionization property (default)
    suppress_Cys : bool, default False
        Suppress cysteine ionization

    Returns
    -------
    DataFrame
        For ret_val "pK", "alpha" or "aavals": one row per T/P/pH
        condition and one column per charged group.  Otherwise: one row
        per condition and one column per row of `aa`.

    Examples
    --------
    >>> from pychnosz import *
    >>> aa = pinfo(pinfo("LYSC_CHICK"))
    >>> Z = ionize_aa(aa, pH=7.0)
    """
    # Ensure inputs are arrays
    T = np.atleast_1d(T)
    if isinstance(P, str):
        P = np.array([P] * len(T))
    else:
        P = np.atleast_1d(P)
    pH_arr = np.atleast_1d(pH)

    # Get maximum length and replicate arrays
    lmax = max(len(T), len(P), len(pH_arr))
    T = np.resize(T, lmax)
    if isinstance(P[0], str):
        P = np.array([P[0]] * lmax)
    else:
        P = np.resize(P, lmax)
    pH_arr = np.resize(pH_arr, lmax)

    # Turn pH into a matrix with as many columns as ionizable groups (9)
    pH_matrix = np.tile(pH_arr[:, np.newaxis], (1, 9))

    # Charges for ionizable groups
    charges = np.array([-1, -1, -1, 1, 1, 1, -1, 1, -1])
    charges_matrix = np.tile(charges, (lmax, 1))

    # The ionizable groups
    neutral = ["[Cys]", "[Asp]", "[Glu]", "[His]", "[Lys]", "[Arg]", "[Tyr]", "[AABB]", "[AABB]"]
    charged = ["[Cys-]", "[Asp-]", "[Glu-]", "[His+]", "[Lys+]", "[Arg+]", "[Tyr-]", "[AABB+]", "[AABB-]"]

    # Get row numbers in OBIGT
    ineutral = [info(g, "aq") for g in neutral]
    icharged = [info(g, "aq") for g in charged]

    # Get unique T, P combinations.
    # unique_pTP holds the position of the first occurrence of each
    # distinct (T, P) pair; indices maps every condition to the row of
    # that pair in the subcrt output.  A dict is used because the row
    # number must follow first-occurrence order, which the iteration
    # order of a set does not guarantee.
    pTP = [f"{t}_{p}" for t, p in zip(T, P)]
    unique_pTP = []
    row_of_key = {}
    indices = []
    for i, tp in enumerate(pTP):
        if tp not in row_of_key:
            row_of_key[tp] = len(unique_pTP)
            unique_pTP.append(i)
        indices.append(row_of_key[tp])

    # Determine which property to calculate
    sprop = ["G", property] if property not in ["A", "Z"] else ["G"]

    # Convert T to Kelvin for subcrt
    TK = convert(T, "K")

    # Call subcrt for unique T, P combinations
    unique_T = TK[unique_pTP]
    unique_P = P[unique_pTP]

    all_species = ineutral + icharged
    sout = subcrt(all_species, T=unique_T, P=unique_P, property=sprop, convert=False)

    def _species_matrix(prop_name):
        """Collect one subcrt property as a (unique T/P) x (species) array."""
        values = np.zeros((len(unique_pTP), len(all_species)))
        for i in range(len(all_species)):
            if isinstance(sout['out'], dict):
                # Single species result
                values[:, i] = sout['out'][prop_name]
            else:
                # Multiple species result
                values[:, i] = sout['out'][i][prop_name].values
        return values

    # Extract G values
    Gs = _species_matrix('G')

    # Gibbs energy difference for each group (charged minus neutral)
    DG = Gs[:, 9:18] - Gs[:, 0:9]

    # Build matrix for all T, P values (including duplicates)
    DG_full = DG[indices, :]

    # Calculate pK values
    DG_full = DG_full * charges
    pK = np.zeros_like(DG_full)
    for i in range(pK.shape[1]):
        pK[:, i] = convert(DG_full[:, i], "logK", T=TK)

    # Return pK if requested
    if ret_val == "pK":
        return pd.DataFrame(pK, columns=charged)

    # Calculate alpha (degree of formation)
    alpha = 1 / (1 + 10 ** (charges_matrix * (pH_matrix - pK)))

    # Suppress cysteine ionization if requested
    if suppress_Cys:
        alpha[:, 0] = 0

    # Return alpha if requested
    if ret_val == "alpha":
        return pd.DataFrame(alpha, columns=charged)

    # Calculate amino acid values
    if property == "Z":
        aavals = charges_matrix.copy()
    elif property == "A":
        aavals = -charges_matrix * (pH_matrix - pK)
    else:
        # Extract property values from subcrt output and expand them
        # to all T, P values
        prop_vals_full = _species_matrix(property)[indices, :]

        # Property difference for each group (charged minus neutral)
        aavals = prop_vals_full[:, 9:18] - prop_vals_full[:, 0:9]

    # Return aavals if requested
    if ret_val == "aavals":
        return pd.DataFrame(aavals, columns=charged)

    # Contribution from each group
    aavals = aavals * alpha

    # Get counts of ionizable groups from aa
    # Columns: Cys, Asp, Glu, His, Lys, Arg, Tyr, chains, chains
    # ("chains" appears twice: once for [AABB+] and once for [AABB-])
    ionize_cols = ["Cys", "Asp", "Glu", "His", "Lys", "Arg", "Tyr", "chains", "chains"]
    aa_counts = aa[ionize_cols].values.astype(float)

    # Calculate total ionization property: (conditions x 9) . (9 x proteins)
    out = np.dot(aavals, aa_counts.T)

    # Create DataFrame
    result = pd.DataFrame(out)

    return result

Calculate additive ionization properties of proteins.

This function calculates the net charge or other ionization properties of proteins based on amino acid composition at specified T, P, and pH.

Parameters

aa : DataFrame
Amino acid composition data
property : str, default "Z"
Property to calculate: "Z" (net charge), "A" (chemical affinity), or another subcrt property (G, H, S, Cp, V)
T : float or array, default 25.0
Temperature in degrees Celsius
P : float, str, or array, default "Psat"
Pressure in bar, or "Psat" for saturation
pH : float or array, default 7.0
pH value(s)
ret_val : str, optional
Return value type: "pK" (return pK values), "alpha" (return degree of formation), "aavals" (return amino acid values), or None (return the ionization property; default)
suppress_Cys : bool, default False
Suppress cysteine ionization

Returns

DataFrame
Ionization properties

Examples

>>> from pychnosz import *
>>> aa = pinfo(pinfo("LYSC_CHICK"))
>>> Z = ionize_aa(aa, pH=7.0)
def pinfo(protein: str | int | pandas.core.frame.DataFrame | List,
organism: str | None = None,
residue: bool = False,
regexp: bool = False) ‑> pandas.core.frame.DataFrame | numpy.ndarray | int
Expand source code
def pinfo(protein: Union[str, int, pd.DataFrame, List],
          organism: Optional[str] = None,
          residue: bool = False,
          regexp: bool = False) -> Union[pd.DataFrame, np.ndarray, int]:
    """
    Get protein information from thermo().protein.

    This function retrieves protein data from the thermodynamic database.
    The behavior depends on the input type:
    - DataFrame: returns a copy of the DataFrame (possibly per residue)
    - int or list of ints: returns rows from thermo().protein
    - str or list of str: searches for protein by name, returns row number(s)

    Parameters
    ----------
    protein : str, int, DataFrame, or list
        Protein identifier(s) or data
    organism : str, optional
        Organism identifier (used with protein name)
    residue : bool, default False
        Return per-residue amino acid composition: the chain count and
        the amino acid counts (columns 4 through 24) are divided by the
        sum of the amino acid counts (columns 5 through 24)
    regexp : bool, default False
        Use regular expression matching for protein search

    Returns
    -------
    DataFrame, array, or int
        Protein information or row numbers.  For name searches, a single
        result is returned as an int (or NaN when there is no match) and
        multiple results as an array in which NaN marks names that were
        not found.  Row numbers outside thermo().protein are dropped; an
        empty DataFrame is returned when none remain.

    Raises
    ------
    RuntimeError
        If thermo().protein has not been loaded.
    TypeError
        If `protein` is not one of the supported input types.

    Examples
    --------
    >>> # Get protein by name
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> # Get protein data by row number
    >>> protein_data = pinfo(iprotein)
    """
    t_p = thermo().protein

    if t_p is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    def _per_residue(frame):
        # Divide chains + all 20 amino acid columns (4..24 inclusive) by the
        # protein length.  A 4:24 slice would skip the last amino acid
        # column.  Column-label assignment replaces the columns, so integer
        # counts can become fractional without dtype errors.
        row_sums = frame.iloc[:, 5:25].sum(axis=1)
        count_cols = frame.columns[4:25]
        frame[count_cols] = frame[count_cols].div(row_sums, axis=0)
        return frame

    # If input is a DataFrame, return it (possibly per residue)
    if isinstance(protein, pd.DataFrame):
        out = protein.copy()
        if residue:
            out = _per_residue(out)
        return out

    # If input is numeric, get rows from thermo().protein
    if isinstance(protein, (int, np.integer)):
        protein = [protein]

    if isinstance(protein, (list, np.ndarray)) and all(isinstance(x, (int, np.integer)) for x in protein):
        # Keep only row numbers that exist in thermo().protein
        n_rows = len(t_p)
        valid_indices = [p for p in protein if 0 <= p < n_rows]

        if not valid_indices:
            return pd.DataFrame()

        out = t_p.iloc[valid_indices].copy()

        # Compute per-residue counts if requested
        if residue:
            out = _per_residue(out)

        return out

    # If input is string or list of strings, search for protein
    if isinstance(protein, str):
        protein = [protein]

    if isinstance(protein, list) and all(isinstance(x, str) for x in protein):
        # Search for protein or protein_organism in thermo().protein
        t_p_names = t_p['protein'] + '_' + t_p['organism']

        if regexp:
            # Use regular expression matching; every matching row is kept
            matches = []
            for prot in protein:
                iprotein = t_p['protein'].str.contains(prot, regex=True, na=False)
                if organism is not None:
                    iorganism = t_p['organism'].str.contains(organism, regex=True, na=False)
                    iprotein = iprotein & iorganism
                indices = np.where(iprotein)[0]
                if len(indices) > 0:
                    matches.extend(indices.tolist())
                else:
                    matches.append(np.nan)
        else:
            # Exact matching on "protein_organism"
            if organism is None:
                my_names = protein
            else:
                my_names = [f"{p}_{organism}" for p in protein]

            # Find matches; only the first matching row is kept per name
            matches = []
            for name in my_names:
                idx = np.where(t_p_names == name)[0]
                if len(idx) > 0:
                    matches.append(idx[0])
                else:
                    matches.append(np.nan)

        # A single result is returned as a scalar
        if len(matches) == 1:
            if np.isnan(matches[0]):
                return np.nan
            return int(matches[0])
        return np.array(matches)

    raise TypeError(f"Unsupported protein type: {type(protein)}")

Get protein information from thermo().protein.

This function retrieves protein data from the thermodynamic database. The behavior depends on the input type: a DataFrame is returned as is (possibly per residue); an int or list of ints returns rows from thermo().protein; a str searches for the protein by name and returns row number(s).

Parameters

protein : str, int, DataFrame, or list
Protein identifier(s) or data
organism : str, optional
Organism identifier (used with protein name)
residue : bool, default False
Return per-residue amino acid composition
regexp : bool, default False
Use regular expression matching for protein search

Returns

DataFrame, array, or int
Protein information or row numbers

Examples

>>> # Get protein by name
>>> iprotein = pinfo("LYSC_CHICK")
>>> # Get protein data by row number
>>> protein_data = pinfo(iprotein)
def protein_OBIGT(protein: int | List[int] | pandas.core.frame.DataFrame,
organism: str | None = None,
state: str | None = None) ‑> pandas.core.frame.DataFrame
Expand source code
def protein_OBIGT(protein: Union[int, List[int], pd.DataFrame],
                 organism: Optional[str] = None,
                 state: Optional[str] = None) -> pd.DataFrame:
    """
    Build OBIGT-style property rows for proteins by group additivity.

    The properties of each protein are the sum of the properties of its
    groups: one [AABB] group per chain, one sidechain group per amino
    acid, and one backbone group for every residue beyond the first of
    each chain.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier
    state : str, optional
        Physical state ('aq' or 'cr'). If None, uses thermo().opt['state']
        and falls back to 'aq' when that option is absent

    Returns
    -------
    DataFrame
        One row per protein: header fields (name, formula, state, ...)
        followed by the summed values of OBIGT columns 9 through 21

    Raises
    ------
    TypeError
        If no amino acid composition DataFrame could be retrieved.
    ValueError
        If one of the required groups is missing from OBIGT for `state`.

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> props = protein_OBIGT(iprotein)
    """
    aa = pinfo(pinfo(protein, organism))
    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    if state is None:
        state = thermo().opt.get('state', 'aq')

    # Backbone group: [UPBB] for aqueous proteins, [PBB] otherwise
    backbone = 'UPBB' if state == 'aq' else 'PBB'

    # Bracketed names of [AABB], the 20 sidechain groups and the backbone
    sidechain_cols = aa.columns[5:25].tolist()
    group_names = [f"[{label}]" for label in ['AABB', *sidechain_cols, backbone]]

    from ..core.info import info

    obigt = thermo().obigt
    in_state = obigt['state'] == state

    # Row labels of the groups in OBIGT, in the same order as group_names
    igroup = []
    for bracketed in group_names:
        hits = obigt[(obigt['name'] == bracketed) & in_state]
        if len(hits) == 0:
            # Fall back to the name without brackets
            hits = obigt[(obigt['name'] == bracketed.strip('[]')) & in_state]
        if len(hits) == 0:
            raise ValueError(f"Group {bracketed} not found in OBIGT for state {state}")
        igroup.append(hits.index[0])

    # Property columns (positions 9 through 21 of OBIGT) for each group
    groupprops = obigt.loc[igroup, obigt.columns[9:22]]

    # Elemental makeup of each group
    groupelements = i2A(igroup)
    element_names = groupelements.columns

    # The model label depends only on the state
    model = 'HKF' if state == 'aq' else 'CGL'

    results = []
    for row_pos in range(len(aa)):
        aa_row = aa.iloc[row_pos]

        # Group multiplicities: chains, sidechains, backbone groups
        sidechain_counts = aa_row.iloc[5:25]
        nchains = float(aa_row.iloc[4])
        length = float(sidechain_counts.sum())
        npbb = length - nchains
        ngroups = np.array([nchains, *sidechain_counts.tolist(), npbb], dtype=float)
        weights = ngroups[:, np.newaxis]

        # Group-additive properties and elemental totals
        eos = (groupprops.values * weights).sum(axis=0)
        f_in = (groupelements.values * weights).sum(axis=0).round(3)

        # Drop elements with a zero total and format the formula
        f_dict = {}
        for elem, amount in zip(element_names, f_in):
            if amount != 0:
                f_dict[elem] = amount
        f = as_chemical_formula(f_dict)

        name = f"{aa_row['protein']}_{aa_row['organism']}"
        print(f"protein_OBIGT: found {name} ({f}, {round(length, 3)} residues)")

        # Header fields first, then the summed properties
        record = {
            'name': name,
            'abbrv': None,
            'formula': f,
            'state': state,
            'ref1': aa_row['ref'],
            'ref2': None,
            'date': None,
            'model': model,
            'E_units': 'cal'
        }
        record.update(zip(groupprops.columns, eos))
        results.append(record)

    return pd.DataFrame(results).reset_index(drop=True)

Calculate protein properties using group additivity.

This function calculates thermodynamic properties of proteins from amino acid composition using the group additivity approach.

Parameters

protein : int, list of int, or DataFrame
Protein identifier(s) or amino acid composition data
organism : str, optional
Organism identifier
state : str, optional
Physical state ('aq' or 'cr'). If None, uses thermo().opt['state']

Returns

DataFrame
Thermodynamic properties in OBIGT format

Examples

>>> iprotein = pinfo("LYSC_CHICK")
>>> props = protein_OBIGT(iprotein)
def protein_basis(protein: int | List[int] | pandas.core.frame.DataFrame,
T: float = 25.0,
normalize: bool = False) ‑> pandas.core.frame.DataFrame
Expand source code
def protein_basis(protein: Union[int, List[int], pd.DataFrame],
                 T: float = 25.0,
                 normalize: bool = False) -> pd.DataFrame:
    """
    Compute basis-species coefficients for protein formation reactions.

    When H+ is one of the current basis species, the net charge of each
    protein (from ionize_aa at temperature `T` and the pH implied by the
    H+ activity) is added to its H+ coefficient.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    T : float, default 25.0
        Temperature in degrees Celsius
    normalize : bool, default False
        Divide each protein's coefficients by its length

    Returns
    -------
    DataFrame
        Coefficients of basis species

    Raises
    ------
    TypeError
        If no amino acid composition DataFrame could be retrieved.

    Examples
    --------
    >>> from pychnosz import *
    >>> basis("CHNOSe")
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> coeffs = protein_basis(iprotein)
    """
    aa = pinfo(pinfo(protein))
    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Protein formulas -> coefficients of the basis species
    sb = species_basis(protein_formula(aa))

    # Account for ionization when H+ is among the basis species
    basis_table = thermo().basis
    basis_names = basis_table.index.tolist() if basis_table is not None else []
    if 'H+' in basis_names:
        h_col = basis_names.index('H+')
        pH = -basis_table.loc['H+', 'logact']
        # First row of the ionize_aa result: one charge per protein
        charge = ionize_aa(aa, T=T, pH=pH).iloc[0, :]
        sb.iloc[:, h_col] = sb.iloc[:, h_col] + charge.values

    if normalize:
        sb = sb.div(protein_length(aa), axis=0)

    return sb

Calculate coefficients of basis species in protein formation reactions.

Parameters

protein : int, list of int, or DataFrame
Protein identifier(s) or amino acid composition data
T : float, default 25.0
Temperature in degrees Celsius
normalize : bool, default False
Normalize by protein length

Returns

DataFrame
Coefficients of basis species

Examples

>>> from pychnosz import *
>>> basis("CHNOSe")
>>> iprotein = pinfo("LYSC_CHICK")
>>> coeffs = protein_basis(iprotein)
def protein_formula(protein: int | List[int] | pandas.core.frame.DataFrame,
organism: str | None = None,
residue: bool = False) ‑> pandas.core.frame.DataFrame
Expand source code
def protein_formula(protein: Union[int, List[int], pd.DataFrame],
                   organism: Optional[str] = None,
                   residue: bool = False) -> pd.DataFrame:
    """
    Compute the elemental formulas of proteins.

    Each formula is the sum of the residue formulas weighted by the amino
    acid counts, plus one H2O per chain.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, passed on to pinfo
    residue : bool, default False
        Divide each formula by the protein's total amino acid count

    Returns
    -------
    DataFrame
        Chemical formulas with elements C, H, N, O, S as columns, indexed
        by "protein_organism"; repeated names get ".1", ".2", ... suffixes

    Raises
    ------
    TypeError
        If no amino acid composition DataFrame could be retrieved.

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> formula = protein_formula(iprotein)
    """
    aa = pinfo(pinfo(protein, organism))
    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Row 0 of the group table is H2O; the remaining rows are the residues
    rf = group_formulas()
    water_formula = rf.iloc[0, :].values.astype(float)
    residue_formulas = rf.iloc[1:, :].values.astype(float)

    # Column 4 holds the chain count, columns 5-24 the amino acid counts
    n_chains = aa.iloc[:, 4].values.astype(float)
    counts = aa.iloc[:, 5:25].values.astype(float)

    # Residue contributions plus one water per chain
    out = np.dot(counts, residue_formulas) + np.outer(n_chains, water_formula)

    if residue:
        lengths = aa.iloc[:, 5:25].sum(axis=1).values
        out = out / lengths[:, np.newaxis]

    # Row labels; disambiguate repeats as name, name.1, name.2, ...
    protein_names = aa['protein'] + '_' + aa['organism']
    if protein_names.duplicated().any():
        times_seen = {}
        labels = []
        for label in protein_names:
            repeat = times_seen.get(label, -1) + 1
            times_seen[label] = repeat
            labels.append(label if repeat == 0 else f"{label}.{repeat}")
        protein_names = labels

    return pd.DataFrame(out, index=protein_names, columns=['C', 'H', 'N', 'O', 'S'])

Calculate chemical formulas of proteins.

Parameters

protein : int, list of int, or DataFrame
Protein identifier(s) or amino acid composition data
organism : str, optional
Organism identifier (used with protein number)
residue : bool, default False
Return per-residue formula

Returns

DataFrame
Chemical formulas with elements C, H, N, O, S as columns

Examples

>>> iprotein = pinfo("LYSC_CHICK")
>>> formula = protein_formula(iprotein)
def protein_length(protein: int | List[int] | pandas.core.frame.DataFrame,
organism: str | None = None) ‑> int | numpy.ndarray
Expand source code
def protein_length(protein: Union[int, List[int], pd.DataFrame],
                   organism: Optional[str] = None) -> Union[int, np.ndarray]:
    """
    Return the number of amino acid residues in each protein.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier, passed on to pinfo

    Returns
    -------
    int or array
        Array with one length per protein (row-wise sum of the amino acid
        count columns 5 through 24), or 0 if pinfo does not return a
        DataFrame

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> length = protein_length(iprotein)
    """
    composition = pinfo(pinfo(protein, organism))

    # No composition table available: report zero length
    if not isinstance(composition, pd.DataFrame):
        return 0

    amino_acid_counts = composition.iloc[:, 5:25]
    return amino_acid_counts.sum(axis=1).values

Calculate the length(s) of proteins.

Parameters

protein : int, list of int, or DataFrame
Protein identifier(s) or amino acid composition data
organism : str, optional
Organism identifier (used with protein number)

Returns

int or array
Protein length(s) in amino acid residues

Examples

>>> iprotein = pinfo("LYSC_CHICK")
>>> length = protein_length(iprotein)