Module pychnosz.biomolecules
Biomolecule thermodynamics package for CHNOSZ.
This package provides thermodynamic calculations for biological molecules including proteins, amino acids, and other biomolecules.
Sub-modules
pychnosz.biomolecules.proteins-
Protein functions for CHNOSZ …
Functions
def add_protein(aa: pandas.core.frame.DataFrame, as_residue: bool = False) ‑> numpy.ndarray-
Expand source code
def add_protein(aa: pd.DataFrame, as_residue: bool = False) -> np.ndarray:
    """
    Add protein amino acid compositions to thermo().protein.

    Proteins whose ``protein_organism`` ID is already present are replaced
    in place; all others are appended. The caller's DataFrame is never
    modified.

    Parameters
    ----------
    aa : DataFrame
        DataFrame with protein amino acid compositions.
        Must have same columns as thermo().protein
    as_residue : bool, default False
        Normalize amino acid counts by protein length

    Returns
    -------
    array
        Row numbers of added/updated proteins in thermo().protein
        (empty if `aa` has no rows)

    Raises
    ------
    RuntimeError
        If the protein database is not loaded.
    ValueError
        If the columns of `aa` differ from thermo().protein, or if `aa`
        contains duplicated protein IDs.

    Examples
    --------
    >>> import pandas as pd
    >>> from pychnosz import *
    >>> aa = pd.read_csv("POLG.csv")
    >>> iprotein = add_protein(aa)
    """
    t = thermo()
    if t.protein is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    # Check that columns match
    if list(aa.columns) != list(t.protein.columns):
        raise ValueError("'aa' does not have the same columns as thermo().protein")

    # Nothing to add: avoid looking up an empty list of names
    if len(aa) == 0:
        return np.array([], dtype=int)

    # Check that new protein IDs are unique
    po = aa['protein'] + '_' + aa['organism']
    idup = po.duplicated()
    if idup.any():
        dup_proteins = po[idup].unique()
        raise ValueError(f"some protein IDs are duplicated: {' '.join(dup_proteins)}")

    # Normalize by protein length if as_residue = True
    if as_residue:
        # Work on a copy so the caller's DataFrame is left untouched
        aa = aa.copy()
        pl = protein_length(aa)
        # Column 4 is the chains column and columns 5:25 hold the 20 amino
        # acid counts; the slice must end at 25 so the last amino acid is
        # normalized too. Assigning by label replaces whole columns, so
        # integer counts can become fractional without a dtype conflict.
        count_cols = list(aa.columns[4:25])
        aa[count_cols] = aa[count_cols].div(pl, axis=0)

    # Find any protein IDs that are already present
    # (pinfo returns a scalar for a single name, an array otherwise)
    ip = np.atleast_1d(pinfo(po.tolist()))
    ip_present = ~np.isnan(ip)

    # Now we're ready to go
    tp_new = t.protein.copy()

    # Add new proteins (appending keeps the positions of existing rows)
    if not ip_present.all():
        new_proteins = aa[~ip_present].copy()
        tp_new = pd.concat([tp_new, new_proteins], ignore_index=True)

    # Update existing proteins
    if ip_present.any():
        valid_ip = ip[ip_present].astype(int)
        tp_new.iloc[valid_ip] = aa[ip_present].values

    # Update the protein database
    tp_new.reset_index(drop=True, inplace=True)
    t.protein = tp_new

    # Return the new row numbers
    ip_new = np.atleast_1d(pinfo(po.tolist()))

    # Print messages
    n_added = int((~ip_present).sum())
    n_replaced = int(ip_present.sum())
    if n_added > 0:
        print(f"add_protein: added {n_added} new protein(s) to thermo().protein")
    if n_replaced > 0:
        print(f"add_protein: replaced {n_replaced} existing protein(s) in thermo().protein")

    return ip_new
Parameters
aa : DataFrame - DataFrame with protein amino acid compositions. Must have the same columns as thermo().protein.
as_residue : bool, default False - Normalize amino acid counts by protein length.
Returns
array- Row numbers of added/updated proteins in thermo().protein
Examples
>>> import pandas as pd >>> from pychnosz import * >>> aa = pd.read_csv("POLG.csv") >>> iprotein = add_protein(aa) def group_formulas() ‑> pandas.core.frame.DataFrame-
Expand source code
def group_formulas() -> pd.DataFrame:
    """
    Return chemical formulas of H2O and the 20 amino acid residues.

    Each residue formula is the sum of an amino acid sidechain group and
    the unfolded protein backbone group [UPBB]. The rows keep the bracketed
    sidechain labels (e.g. ``[Ala]``) although they hold residue formulas;
    [UPBB] itself is not returned as a row.

    Returns
    -------
    DataFrame
        21 rows (``H2O`` followed by ``[Ala]`` ... ``[Tyr]``) with the
        element counts C, H, N, O, S as columns
    """
    elements = ['C', 'H', 'N', 'O', 'S']
    water = [0, 2, 0, 1, 0]
    backbone = np.array([2, 2, 1, 1, 0])  # [UPBB]

    # Sidechain groups; insertion order defines the row order of the result
    sidechains = {
        '[Ala]': [1, 3, 0, 0, 0],
        '[Cys]': [1, 3, 0, 0, 1],
        '[Asp]': [2, 3, 0, 2, 0],
        '[Glu]': [3, 5, 0, 2, 0],
        '[Phe]': [7, 7, 0, 0, 0],
        '[Gly]': [0, 1, 0, 0, 0],
        '[His]': [4, 5, 2, 0, 0],
        '[Ile]': [4, 9, 0, 0, 0],
        '[Lys]': [4, 10, 1, 0, 0],
        '[Leu]': [4, 9, 0, 0, 0],
        '[Met]': [3, 7, 0, 0, 1],
        '[Asn]': [2, 4, 1, 1, 0],
        '[Pro]': [3, 5, 0, 0, 0],
        '[Gln]': [3, 6, 1, 1, 0],
        '[Arg]': [4, 10, 3, 0, 0],
        '[Ser]': [1, 3, 0, 1, 0],
        '[Thr]': [2, 5, 0, 1, 0],
        '[Val]': [3, 7, 0, 0, 0],
        '[Trp]': [9, 8, 1, 0, 0],
        '[Tyr]': [7, 7, 0, 1, 0],
    }

    # Add [UPBB] to every sidechain group to get the residues
    residues = np.array(list(sidechains.values())) + backbone
    table = np.vstack([water, residues])

    return pd.DataFrame(table, index=['H2O', *sidechains], columns=elements)
This function returns a DataFrame with the chemical formulas of H2O and the 20 amino acid residues; each residue formula is the sum of a sidechain group and the unfolded protein backbone group [UPBB].
Returns
DataFrame- Chemical formulas with elements C, H, N, O, S as columns and residues as rows
def ionize_aa(aa: pandas.core.frame.DataFrame,
property: str = 'Z',
T: float | numpy.ndarray = 25.0,
P: float | str | numpy.ndarray = 'Psat',
pH: float | numpy.ndarray = 7.0,
ret_val: str | None = None,
suppress_Cys: bool = False) ‑> pandas.core.frame.DataFrame-
Expand source code
def ionize_aa(aa: pd.DataFrame,
              property: str = "Z",
              T: Union[float, np.ndarray] = 25.0,
              P: Union[float, str, np.ndarray] = "Psat",
              pH: Union[float, np.ndarray] = 7.0,
              ret_val: Optional[str] = None,
              suppress_Cys: bool = False) -> pd.DataFrame:
    """
    Calculate additive ionization properties of proteins.

    This function calculates the net charge or other ionization properties
    of proteins based on amino acid composition at specified T, P, and pH.
    T, P and pH are recycled to a common length; each resulting condition
    is one row of the output.

    Parameters
    ----------
    aa : DataFrame
        Amino acid composition data; must contain the columns
        Cys, Asp, Glu, His, Lys, Arg, Tyr and chains
    property : str, default "Z"
        Property to calculate:
        - "Z": net charge
        - "A": chemical affinity
        - Other subcrt properties (G, H, S, Cp, V)
    T : float or array, default 25.0
        Temperature in degrees Celsius
    P : float, str, or array, default "Psat"
        Pressure in bar, or "Psat" for saturation
    pH : float or array, default 7.0
        pH value(s)
    ret_val : str, optional
        Return value type:
        - "pK": return pK values
        - "alpha": return degree of formation
        - "aavals": return amino acid values
        - None: return ionization property (default)
    suppress_Cys : bool, default False
        Suppress cysteine ionization

    Returns
    -------
    DataFrame
        For ret_val "pK", "alpha" or "aavals": one row per condition and
        one column per ionizable group. Otherwise: one row per condition
        and one column per row of `aa`.

    Examples
    --------
    >>> from pychnosz import *
    >>> aa = pinfo(pinfo("LYSC_CHICK"))
    >>> Z = ionize_aa(aa, pH=7.0)
    """
    # Ensure inputs are arrays
    T = np.atleast_1d(T)
    if isinstance(P, str):
        P = np.array([P] * len(T))
    else:
        P = np.atleast_1d(P)
    pH_arr = np.atleast_1d(pH)

    # Get maximum length and replicate arrays
    lmax = max(len(T), len(P), len(pH_arr))
    T = np.resize(T, lmax)
    if isinstance(P[0], str):
        P = np.array([P[0]] * lmax)
    else:
        P = np.resize(P, lmax)
    pH_arr = np.resize(pH_arr, lmax)

    # Turn pH into a matrix with as many columns as ionizable groups (9)
    pH_matrix = np.tile(pH_arr[:, np.newaxis], (1, 9))

    # Charges for ionizable groups
    charges = np.array([-1, -1, -1, 1, 1, 1, -1, 1, -1])
    charges_matrix = np.tile(charges, (lmax, 1))

    # The ionizable groups
    neutral = ["[Cys]", "[Asp]", "[Glu]", "[His]", "[Lys]", "[Arg]",
               "[Tyr]", "[AABB]", "[AABB]"]
    charged = ["[Cys-]", "[Asp-]", "[Glu-]", "[His+]", "[Lys+]", "[Arg+]",
               "[Tyr-]", "[AABB+]", "[AABB-]"]

    # Get row numbers in OBIGT
    ineutral = [info(g, "aq") for g in neutral]
    icharged = [info(g, "aq") for g in charged]

    # Get unique T, P combinations.
    # unique_pTP holds the position of the first condition with each
    # distinct (T, P) pair; indices maps every condition to the row of its
    # pair among those unique pairs. The mapping is kept in a dict keyed by
    # first appearance: a set has no defined order and cannot be indexed
    # reliably.
    row_of_pair = {}
    unique_pTP = []
    indices = []
    for i, key in enumerate(f"{t}_{p}" for t, p in zip(T, P)):
        if key not in row_of_pair:
            row_of_pair[key] = len(unique_pTP)
            unique_pTP.append(i)
        indices.append(row_of_pair[key])

    # Determine which property to calculate
    sprop = ["G", property] if property not in ["A", "Z"] else ["G"]

    # Convert T to Kelvin for subcrt
    TK = convert(T, "K")

    # Call subcrt for unique T, P combinations
    unique_T = TK[unique_pTP]
    unique_P = P[unique_pTP]

    all_species = ineutral + icharged
    sout = subcrt(all_species, T=unique_T, P=unique_P, property=sprop, convert=False)

    # Extract G values
    Gs = np.zeros((len(unique_pTP), len(all_species)))
    for i in range(len(all_species)):
        if isinstance(sout['out'], dict):
            # Single species result
            Gs[:, i] = sout['out']['G']
        else:
            # Multiple species result
            Gs[:, i] = sout['out'][i]['G'].values

    # Gibbs energy difference for each group (charged minus neutral)
    DG = Gs[:, 9:18] - Gs[:, 0:9]

    # Build matrix for all T, P values (including duplicates)
    DG_full = DG[indices, :]

    # Calculate pK values
    DG_full = DG_full * charges
    pK = np.zeros_like(DG_full)
    for i in range(pK.shape[1]):
        pK[:, i] = convert(DG_full[:, i], "logK", T=TK)

    # Return pK if requested
    if ret_val == "pK":
        return pd.DataFrame(pK, columns=charged)

    # Calculate alpha (degree of formation)
    alpha = 1 / (1 + 10 ** (charges_matrix * (pH_matrix - pK)))

    # Suppress cysteine ionization if requested
    if suppress_Cys:
        alpha[:, 0] = 0

    # Return alpha if requested
    if ret_val == "alpha":
        return pd.DataFrame(alpha, columns=charged)

    # Calculate amino acid values
    if property == "Z":
        aavals = charges_matrix.copy()
    elif property == "A":
        aavals = -charges_matrix * (pH_matrix - pK)
    else:
        # Extract property values from subcrt output
        prop_vals = np.zeros((len(unique_pTP), len(all_species)))
        for i in range(len(all_species)):
            if isinstance(sout['out'], dict):
                prop_vals[:, i] = sout['out'][property]
            else:
                prop_vals[:, i] = sout['out'][i][property].values

        # Build matrix for all T, P values
        prop_vals_full = prop_vals[indices, :]

        # Property difference for each group
        aavals = prop_vals_full[:, 9:18] - prop_vals_full[:, 0:9]

    # Return aavals if requested
    if ret_val == "aavals":
        return pd.DataFrame(aavals, columns=charged)

    # Contribution from each group
    aavals = aavals * alpha

    # Get counts of ionizable groups from aa
    # Columns: Cys, Asp, Glu, His, Lys, Arg, Tyr, chains, chains
    # (chains is listed twice: once for [AABB+] and once for [AABB-])
    ionize_cols = ["Cys", "Asp", "Glu", "His", "Lys", "Arg", "Tyr",
                   "chains", "chains"]
    aa_counts = aa[ionize_cols].values.astype(float)

    # Calculate total ionization property (conditions x proteins)
    out = np.dot(aavals, aa_counts.T)

    # Create DataFrame
    result = pd.DataFrame(out)

    return result
This function calculates the net charge or other ionization properties of proteins based on amino acid composition at specified T, P, and pH.
Parameters
aa : DataFrame - Amino acid composition data
property : str, default "Z" - Property to calculate: "Z" (net charge), "A" (chemical affinity), or another subcrt property (G, H, S, Cp, V)
T : float or array, default 25.0 - Temperature in degrees Celsius
P : float, str, or array, default "Psat" - Pressure in bar, or "Psat" for saturation
pH : float or array, default 7.0 - pH value(s)
ret_val : str, optional - Return value type: "pK" (pK values), "alpha" (degree of formation), "aavals" (amino acid values), or None (the ionization property; default)
suppress_Cys : bool, default False - Suppress cysteine ionization
Returns
DataFrame- Ionization properties
Examples
>>> from pychnosz import * >>> aa = pinfo(pinfo("LYSC_CHICK")) >>> Z = ionize_aa(aa, pH=7.0) def pinfo(protein: str | int | pandas.core.frame.DataFrame | List,
organism: str | None = None,
residue: bool = False,
regexp: bool = False) ‑> pandas.core.frame.DataFrame | numpy.ndarray | int-
Expand source code
def pinfo(protein: Union[str, int, pd.DataFrame, List],
          organism: Optional[str] = None,
          residue: bool = False,
          regexp: bool = False) -> Union[pd.DataFrame, np.ndarray, int]:
    """
    Get protein information from thermo().protein.

    This function retrieves protein data from the thermodynamic database.
    The behavior depends on the input type:
    - DataFrame: returns a copy of the DataFrame (possibly per residue)
    - int or list of ints: returns rows from thermo().protein; row numbers
      outside the table are dropped
    - str or list of str: searches for protein by name, returns row number(s)

    Parameters
    ----------
    protein : str, int, DataFrame, or list
        Protein identifier(s) or data
    organism : str, optional
        Organism identifier (used with protein name)
    residue : bool, default False
        Return per-residue amino acid composition
    regexp : bool, default False
        Use regular expression matching for protein search

    Returns
    -------
    DataFrame, array, or int
        Protein information or row numbers. A name search with a single
        result returns an int (or NaN if nothing matched); otherwise an
        array with NaN for names that were not found.

    Raises
    ------
    RuntimeError
        If the protein database is not loaded.
    TypeError
        If `protein` is of an unsupported type.

    Examples
    --------
    >>> # Get protein by name
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> # Get protein data by row number
    >>> protein_data = pinfo(iprotein)
    """
    t_p = thermo().protein
    if t_p is None:
        raise RuntimeError("Protein database not loaded. Run reset() first.")

    # If input is a DataFrame, return it (possibly per residue)
    if isinstance(protein, pd.DataFrame):
        return _pinfo_per_residue(protein) if residue else protein.copy()

    # If input is numeric, get rows from thermo().protein
    if isinstance(protein, (int, np.integer)):
        protein = [protein]

    if isinstance(protein, (list, np.ndarray)) and all(
            isinstance(x, (int, np.integer)) for x in protein):
        # Keep only row numbers that exist in the table
        nrows = len(t_p)
        valid_indices = [p for p in protein if 0 <= p < nrows]
        if not valid_indices:
            return pd.DataFrame()

        out = t_p.iloc[valid_indices]
        return _pinfo_per_residue(out) if residue else out.copy()

    # If input is string or list of strings, search for protein
    if isinstance(protein, str):
        protein = [protein]

    if isinstance(protein, list) and all(isinstance(x, str) for x in protein):
        matches = []
        if regexp:
            # Use regular expression matching; every matching row is kept
            iorganism = None
            if organism is not None:
                iorganism = t_p['organism'].str.contains(organism, regex=True, na=False)
            for prot in protein:
                iprotein = t_p['protein'].str.contains(prot, regex=True, na=False)
                if iorganism is not None:
                    iprotein = iprotein & iorganism
                found = np.where(iprotein)[0]
                if len(found) > 0:
                    matches.extend(found.tolist())
                else:
                    matches.append(np.nan)
        else:
            # Exact matching on protein_organism; only the first hit is kept
            t_p_names = t_p['protein'] + '_' + t_p['organism']
            if organism is None:
                my_names = protein
            else:
                my_names = [f"{p}_{organism}" for p in protein]
            for name in my_names:
                found = np.where(t_p_names == name)[0]
                matches.append(found[0] if len(found) > 0 else np.nan)

        # A single result is returned as a scalar
        if len(matches) == 1:
            if np.isnan(matches[0]):
                return np.nan
            return int(matches[0])
        return np.array(matches)

    raise TypeError(f"Unsupported protein type: {type(protein)}")


def _pinfo_per_residue(aa: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `aa` with chains and amino acid counts divided by protein length."""
    out = aa.copy()
    # Protein length is the sum of the 20 amino acid counts (columns 5:25)
    lengths = out.iloc[:, 5:25].sum(axis=1)
    # Normalize the chains column (4) and all 20 amino acid columns; the
    # slice must end at 25 so the last amino acid is not left as a raw
    # count. Assigning by label replaces whole columns, so integer counts
    # can become fractional without a dtype conflict.
    count_cols = list(out.columns[4:25])
    out[count_cols] = out[count_cols].div(lengths, axis=0)
    return out
This function retrieves protein data from the thermodynamic database. The behavior depends on the input type: - DataFrame: returns the DataFrame (possibly per residue) - int or list of ints: returns rows from thermo().protein - str: searches for protein by name, returns row number(s)
Parameters
protein : str, int, DataFrame, or list - Protein identifier(s) or data
organism : str, optional - Organism identifier (used with protein name)
residue : bool, default False - Return per-residue amino acid composition
regexp : bool, default False - Use regular expression matching for protein search
Returns
DataFrame, array, or int - Protein information or row numbers
Examples
>>> # Get protein by name >>> iprotein = pinfo("LYSC_CHICK") >>> # Get protein data by row number >>> protein_data = pinfo(iprotein) def protein_OBIGT(protein: int | List[int] | pandas.core.frame.DataFrame,
organism: str | None = None,
state: str | None = None) ‑> pandas.core.frame.DataFrame-
Expand source code
def protein_OBIGT(protein: Union[int, List[int], pd.DataFrame],
                  organism: Optional[str] = None,
                  state: Optional[str] = None) -> pd.DataFrame:
    """
    Build OBIGT-style rows for proteins by group additivity.

    The properties of each protein are the count-weighted sum of the
    properties of its groups: one [AABB] per chain, one sidechain group per
    amino acid, and one backbone group per residue in excess of the number
    of chains.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier
    state : str, optional
        Physical state ('aq' or 'cr'). If None, uses thermo().opt['state']
        (falling back to 'aq')

    Returns
    -------
    DataFrame
        One row per protein: name, abbrv, formula, state, ref1, ref2, date,
        model and E_units, followed by the summed group properties

    Raises
    ------
    TypeError
        If no amino acid composition table could be retrieved.
    ValueError
        If one of the groups is missing from OBIGT for the chosen state.

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> props = protein_OBIGT(iprotein)
    """
    aa = pinfo(pinfo(protein, organism))
    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    if state is None:
        state = thermo().opt.get('state', 'aq')

    # Backbone group: [UPBB] for aq, [PBB] for any other state
    backbone = 'UPBB' if state == 'aq' else 'PBB'

    # Bracketed names of [AABB], the sidechain groups (named after the
    # amino acid columns 5:25) and the backbone group
    sidechain_names = aa.columns[5:25].tolist()
    group_names = [f"[{g}]" for g in ['AABB', *sidechain_names, backbone]]

    # Retained from the original listing; `info` is not used below
    from ..core.info import info

    obigt = thermo().obigt
    in_state = obigt['state'] == state

    # Locate each group in OBIGT, retrying without brackets when needed
    igroup = []
    for group_name in group_names:
        hits = obigt[(obigt['name'] == group_name) & in_state]
        if len(hits) == 0:
            hits = obigt[(obigt['name'] == group_name.strip('[]')) & in_state]
        if len(hits) == 0:
            raise ValueError(f"Group {group_name} not found in OBIGT for state {state}")
        igroup.append(hits.index[0])

    # Property columns 9 through 21 of OBIGT (G ... z.T)
    groupprops = obigt.loc[igroup, obigt.columns[9:22]]
    prop_names = groupprops.columns
    prop_matrix = groupprops.values

    # Elemental makeup of each group
    groupelements = i2A(igroup)
    element_names = groupelements.columns
    element_matrix = groupelements.values

    # Model label follows the state
    model = 'HKF' if state == 'aq' else 'CGL'

    rows = []
    for aa_row in (aa.iloc[i] for i in range(len(aa))):
        # Group counts: chains [=AABB], sidechains, protein backbone
        nchains = float(aa_row.iloc[4])
        length = float(aa_row.iloc[5:25].sum())
        npbb = length - nchains
        ngroups = np.array([nchains, *aa_row.iloc[5:25].tolist(), npbb],
                           dtype=float)
        weights = ngroups[:, np.newaxis]

        # Group additivity for the properties and for the formula
        eos = (prop_matrix * weights).sum(axis=0)
        f_in = (element_matrix * weights).sum(axis=0).round(3)

        # Drop elements that do not occur, then build the formula string
        f_dict = {elem: n for elem, n in zip(element_names, f_in) if n != 0}
        f = as_chemical_formula(f_dict)

        name = f"{aa_row['protein']}_{aa_row['organism']}"
        print(f"protein_OBIGT: found {name} ({f}, {round(length, 3)} residues)")

        record = {
            'name': name,
            'abbrv': None,
            'formula': f,
            'state': state,
            'ref1': aa_row['ref'],
            'ref2': None,
            'date': None,
            'model': model,
            'E_units': 'cal'
        }
        record.update(zip(prop_names, eos))
        rows.append(record)

    return pd.DataFrame(rows).reset_index(drop=True)
This function calculates thermodynamic properties of proteins from amino acid composition using the group additivity approach.
Parameters
protein : int, list of int, or DataFrame - Protein identifier(s) or amino acid composition data
organism : str, optional - Organism identifier
state : str, optional - Physical state ('aq' or 'cr'). If None, uses thermo().opt['state']
Returns
DataFrame- Thermodynamic properties in OBIGT format
Examples
>>> iprotein = pinfo("LYSC_CHICK") >>> props = protein_OBIGT(iprotein) def protein_basis(protein: int | List[int] | pandas.core.frame.DataFrame,
T: float = 25.0,
normalize: bool = False) ‑> pandas.core.frame.DataFrame-
Expand source code
def protein_basis(protein: Union[int, List[int], pd.DataFrame],
                  T: float = 25.0,
                  normalize: bool = False) -> pd.DataFrame:
    """
    Compute basis-species coefficients for protein formation reactions.

    When H+ is one of the basis species, the H+ coefficient of each protein
    is adjusted by the value returned by ionize_aa at temperature `T` and
    at the pH taken from the H+ activity in the basis definition.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    T : float, default 25.0
        Temperature in degrees Celsius
    normalize : bool, default False
        Normalize by protein length

    Returns
    -------
    DataFrame
        Coefficients of basis species

    Raises
    ------
    TypeError
        If no amino acid composition table could be retrieved.

    Examples
    --------
    >>> from pychnosz import *
    >>> basis("CHNOSe")
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> coeffs = protein_basis(iprotein)
    """
    aa = pinfo(pinfo(protein))
    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Protein formulas -> coefficients of the basis species
    sb = species_basis(protein_formula(aa))

    # Account for ionization when H+ is a basis species
    t = thermo()
    basis_names = [] if t.basis is None else t.basis.index.tolist()
    if 'H+' in basis_names:
        h_col = basis_names.index('H+')
        pH = -t.basis.loc['H+', 'logact']
        # First row of the ionize_aa result: one value per protein
        Z = ionize_aa(aa, T=T, pH=pH).iloc[0, :]
        sb.iloc[:, h_col] = sb.iloc[:, h_col] + Z.values

    if not normalize:
        return sb

    # Per-residue coefficients
    return sb.div(protein_length(aa), axis=0)
Parameters
protein : int, list of int, or DataFrame - Protein identifier(s) or amino acid composition data
T : float, default 25.0 - Temperature in degrees Celsius
normalize : bool, default False - Normalize by protein length
Returns
DataFrame- Coefficients of basis species
Examples
>>> from pychnosz import * >>> basis("CHNOSe") >>> iprotein = pinfo("LYSC_CHICK") >>> coeffs = protein_basis(iprotein) def protein_formula(protein: int | List[int] | pandas.core.frame.DataFrame,
organism: str | None = None,
residue: bool = False) ‑> pandas.core.frame.DataFrame-
Expand source code
def protein_formula(protein: Union[int, List[int], pd.DataFrame],
                    organism: Optional[str] = None,
                    residue: bool = False) -> pd.DataFrame:
    """
    Compute the elemental formulas of proteins.

    Each formula is the sum of the residue formulas weighted by the amino
    acid counts, plus one H2O per chain.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier (used with protein number)
    residue : bool, default False
        Return per-residue formula

    Returns
    -------
    DataFrame
        Element counts C, H, N, O, S as columns, one row per protein,
        indexed by ``protein_organism`` (repeated names get ``.1``, ``.2``,
        ... appended)

    Raises
    ------
    TypeError
        If no amino acid composition table could be retrieved.

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> formula = protein_formula(iprotein)
    """
    aa = pinfo(pinfo(protein, organism))
    if not isinstance(aa, pd.DataFrame):
        raise TypeError("Could not retrieve protein data")

    # Row 0 of the group formulas is H2O; the remaining rows are residues
    rf = group_formulas()
    water_formula = rf.iloc[0, :].values.astype(float)
    residue_formulas = rf.iloc[1:, :].values.astype(float)

    # Column 4 is the number of chains, columns 5:25 the amino acid counts
    n_chains = aa.iloc[:, 4].values.astype(float)
    n_residues = aa.iloc[:, 5:25].values.astype(float)

    # Residue contribution plus one H2O for every chain
    out = n_residues @ residue_formulas + n_chains[:, np.newaxis] * water_formula

    if residue:
        lengths = aa.iloc[:, 5:25].sum(axis=1).values
        out = out / lengths.reshape(-1, 1)

    # Row labels: protein_organism, made unique when names repeat
    labels = aa['protein'] + '_' + aa['organism']
    if labels.duplicated().any():
        repeats = {}
        deduped = []
        for label in labels:
            previous = repeats.get(label)
            if previous is None:
                repeats[label] = 0
                deduped.append(label)
            else:
                repeats[label] = previous + 1
                deduped.append(f"{label}.{previous + 1}")
        labels = deduped

    return pd.DataFrame(out, index=labels, columns=['C', 'H', 'N', 'O', 'S'])
Parameters
protein : int, list of int, or DataFrame - Protein identifier(s) or amino acid composition data
organism : str, optional - Organism identifier (used with protein number)
residue : bool, default False - Return per-residue formula
Returns
DataFrame- Chemical formulas with elements C, H, N, O, S as columns
Examples
>>> iprotein = pinfo("LYSC_CHICK") >>> formula = protein_formula(iprotein) def protein_length(protein: int | List[int] | pandas.core.frame.DataFrame,
organism: str | None = None) ‑> int | numpy.ndarray-
Expand source code
def protein_length(protein: Union[int, List[int], pd.DataFrame],
                   organism: Optional[str] = None) -> Union[int, np.ndarray]:
    """
    Return the number of amino acid residues in each protein.

    Parameters
    ----------
    protein : int, list of int, or DataFrame
        Protein identifier(s) or amino acid composition data
    organism : str, optional
        Organism identifier (used with protein number)

    Returns
    -------
    int or array
        Array with one length per protein, or 0 if no amino acid
        composition table could be retrieved

    Examples
    --------
    >>> iprotein = pinfo("LYSC_CHICK")
    >>> length = protein_length(iprotein)
    """
    aa = pinfo(pinfo(protein, organism))
    if not isinstance(aa, pd.DataFrame):
        return 0

    # The length is the row sum of the amino acid counts (columns 5:25)
    return aa.iloc[:, 5:25].sum(axis=1).values
Parameters
protein : int, list of int, or DataFrame - Protein identifier(s) or amino acid composition data
organism : str, optional - Organism identifier (used with protein number)
Returns
int or array - Protein length(s) in amino acid residues
Examples
>>> iprotein = pinfo("LYSC_CHICK") >>> length = protein_length(iprotein)